impala/tests/util/workload_management.py
Joe McDonnell c5a0ec8bdf IMPALA-11980 (part 1): Put all thrift-generated python code into the impala_thrift_gen package
This puts all of the thrift-generated python code into the
impala_thrift_gen package. This is similar to what Impyla does
for its thrift-generated python code, except that Impala uses
the impala_thrift_gen package rather than impala._thrift_gen.
This is a preparatory patch for fixing the absolute import
issues.
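
As an illustration, the import of TQueryTableColumn in
tests/util/workload_management.py now reads:

  from impala_thrift_gen.SystemTables.ttypes import TQueryTableColumn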

This patches all of the thrift files to add the python namespace,
and it adds code that applies the same patching to the thirdparty
thrift files (hive_metastore.thrift, fb303.thrift).
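
Each patched file gains a declaration of this form (shown here for
SystemTables.thrift; module name illustrative):

  namespace py impala_thrift_gen.SystemTables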

Putting all the generated python into a package makes it easier
to understand where the imports are getting code. When the
subsequent change rearranges the shell code, the thrift-generated
code can stay in a separate directory.

This uses isort to sort the imports for the affected Python files
with the provided .isort.cfg file. This also adds an impala-isort
shell script to make it easy to run.
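
With that script, sorting a single file could look like (hypothetical
invocation):

  impala-isort tests/util/workload_management.py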

Testing:
 - Ran a core job

Change-Id: Ie2927f22c7257aa38a78084efe5bd76d566493c0
Reviewed-on: http://gerrit.cloudera.org:8080/20169
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Reviewed-by: Riza Suminto <riza.suminto@cloudera.com>
2025-04-15 17:03:02 +00:00

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import absolute_import, division, print_function

import os
import re
import requests
from datetime import datetime
from time import sleep, time

from impala_thrift_gen.SystemTables.ttypes import TQueryTableColumn
from tests.util.assert_time import assert_time_str, convert_to_milliseconds
from tests.util.memory import assert_byte_str, convert_to_bytes

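# Safety buffer added to the dedicated coordinator memory estimate:
# 100 MiB, i.e. 100 * 1024 * 1024 bytes.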
DEDICATED_COORD_SAFETY_BUFFER_BYTES = 104857600

WM_DB = "sys"
QUERY_TBL_LOG_NAME = "impala_query_log"
QUERY_TBL_LOG = "{0}.{1}".format(WM_DB, QUERY_TBL_LOG_NAME)
QUERY_TBL_LIVE_NAME = "impala_query_live"
QUERY_TBL_LIVE = "{0}.{1}".format(WM_DB, QUERY_TBL_LIVE_NAME)

# Time in seconds that assert_query and assert_csv_col will wait for the query to
# become available in the relevant workload management table.
ASSERT_QUERY_TIMEOUT_S = 30

# String parsing format for query start/end time fields in a text query profile.
QUERY_PROFILE_DT_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
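# Note: profile timestamps carry three more digits of sub-second precision than
# QUERY_PROFILE_DT_FORMAT's %f accepts, so callers trim the last three characters
# (the [:-3] slices below) before parsing.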


def round_to_3(val):
  # The differences between round in Python 2 and Python 3 do not matter here.
  # pylint: disable=round-builtin
  return round(val, 3)


def assert_query(query_tbl, client, expected_cluster_id="", raw_profile=None,
    impalad=None, query_id=None, max_mem_for_admission=None, max_row_size=None,
    expected_overrides={}):
  """Helper function to assert that the values in the completed query log table
     match the values from the query profile."""
  ret_data = {}

  # If query_id was specified, read the profile from the Impala webserver.
  if query_id is not None:
    assert impalad is not None
    assert raw_profile is None, "cannot specify both query_id and raw_profile"
    resp = requests.get("http://{0}:{1}/query_profile_plain_text?query_id={2}"
        .format(impalad.hostname, impalad.get_webserver_port(), query_id))
    assert resp.status_code == 200, "Response code was: {0}".format(resp.status_code)
    profile_text = resp.text
  else:
    profile_text = raw_profile
    assert raw_profile is not None, "must specify either raw_profile or query_id"
    match = re.search(r'Query \(id=(.*?)\)', profile_text)
    assert match is not None
    query_id = match.group(1)

  print("Query Id: {0}".format(query_id))
  profile_lines = profile_text.split("\n")

  success = False
  sql_results = None
  start_time = time()
  while (time() - start_time <= ASSERT_QUERY_TIMEOUT_S):
    # Force Impala to process the inserts to the completed queries table.
    if query_tbl != QUERY_TBL_LIVE:
      client.execute("refresh " + query_tbl)

    # Assert the query was written correctly to the query log table.
    if max_row_size is not None:
      client.set_configuration_option("MAX_ROW_SIZE", max_row_size)
    sql_results = client.execute("select * from {0} where query_id='{1}'".format(
        query_tbl, query_id))
    if sql_results.success and len(sql_results.data) == 1:
      success = True
      break

    # Query is not yet available in the workload management table, wait and try again.
    sleep(1)

  assert success, "Did not find query '{}' in the '{}' table after multiple attempts" \
      .format(query_id, query_tbl)

  # Assert the expected columns were included.
  assert len(sql_results.column_labels) == len(TQueryTableColumn._VALUES_TO_NAMES)
  data = sql_results.data[0].split("\t")
  assert len(data) == len(sql_results.column_labels)

  def column_val(index):
    name = TQueryTableColumn._VALUES_TO_NAMES[index]
    assert sql_results.column_labels[index] == name
    ret_data[name] = data[index]
    return data[index]

  def assert_col(col, profile_re):
    """Asserts the value of a single column matches the expected value retrieved from
       the profile regular expression. Uses the value specified in expected_overrides
       if a value exists for the specified column."""
    value = column_val(col)
    if col in expected_overrides:
      assert value == expected_overrides[col]
    else:
      columns = re.search(profile_re, profile_text)
      if columns is not None:
        assert value == columns.group(1)
      else:
        assert value == ""

  # Cluster ID
  assert column_val(TQueryTableColumn.CLUSTER_ID) == expected_cluster_id, \
      "cluster id incorrect"

  # Query ID
  assert column_val(TQueryTableColumn.QUERY_ID) == query_id

  # Session ID
  session_id = re.search(r'\n\s+Session ID:\s+(.*)\n', profile_text)
  assert session_id is not None
  assert column_val(TQueryTableColumn.SESSION_ID) == session_id.group(1), \
      "session id incorrect"

  # Session Type
  session_type = re.search(r'\n\s+Session Type:\s+(.*)\n', profile_text)
  assert session_type is not None
  assert column_val(TQueryTableColumn.SESSION_TYPE) == session_type.group(1), \
      "session type incorrect"

  # HS2 Protocol Version
  value = column_val(TQueryTableColumn.HIVESERVER2_PROTOCOL_VERSION)
  if session_type.group(1) == "HIVESERVER2":
    hs2_ver = re.search(r'\n\s+HiveServer2 Protocol Version:\s+(.*)', profile_text)
    assert hs2_ver is not None
    assert value == hs2_ver.group(1)
  else:
    assert value == ""

  # Database User
  user = re.search(r'\n\s+User:\s+(.*?)\n', profile_text)
  assert user is not None
  assert column_val(TQueryTableColumn.DB_USER) == user.group(1), "db user incorrect"

  # Connected Database User
  db_user = re.search(r'\n\s+Connected User:\s+(.*?)\n', profile_text)
  assert db_user is not None
  assert column_val(TQueryTableColumn.DB_USER_CONNECTION) == db_user.group(1), \
      "db user connection incorrect"

  # Database Name
  default_db = re.search(r'\n\s+Default Db:\s+(.*?)\n', profile_text)
  assert default_db is not None
  assert column_val(TQueryTableColumn.DB_NAME) == default_db.group(1), \
      "database name incorrect"

  # Coordinator
  coordinator = re.search(r'\n\s+Coordinator:\s+(.*?)\n', profile_text)
  assert coordinator is not None
  assert column_val(TQueryTableColumn.IMPALA_COORDINATOR) == coordinator.group(1), \
      "impala coordinator incorrect"

  # Query Status (can be multiple lines if the query errored)
  query_status = re.search(r'\n\s+Query Status:\s+(.*?)\n\s+Impala Version', profile_text,
      re.DOTALL)
  assert query_status is not None
  assert column_val(TQueryTableColumn.QUERY_STATUS) == query_status.group(1), \
      "query status incorrect"

  # Query State
  query_state = re.search(r'\n\s+Query State:\s+(.*?)\n', profile_text)
  assert query_state is not None
  query_state_value = query_state.group(1)
  assert column_val(TQueryTableColumn.QUERY_STATE) == query_state_value, \
      "query state incorrect"

  # Impala Query End State
  impala_query_state = re.search(r'\n\s+Impala Query State:\s+(.*?)\n', profile_text)
  assert impala_query_state is not None
  assert column_val(TQueryTableColumn.IMPALA_QUERY_END_STATE) \
      == impala_query_state.group(1), "impala query end state incorrect"

  # Query Type
  value = column_val(TQueryTableColumn.QUERY_TYPE)
  if query_state_value == "EXCEPTION":
    assert value == "UNKNOWN", "query type incorrect"
  else:
    query_type = re.search(r'\n\s+Query Type:\s+(.*?)\n', profile_text)
    assert query_type is not None
    assert value == query_type.group(1), "query type incorrect"
    query_type = query_type.group(1)

  # Client Network Address
  network_address = re.search(r'\n\s+Network Address:\s+(.*?)\n', profile_text)
  assert network_address is not None
  assert column_val(TQueryTableColumn.NETWORK_ADDRESS) == network_address.group(1), \
      "network address incorrect"

  # Offset from UTC
  utc_now = datetime.utcnow().replace(microsecond=0, second=0)
  local_now = datetime.now().replace(microsecond=0, second=0)
  utc_offset = utc_now - local_now
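  # The profile prints Start/End Time in local time while the table stores UTC
  # (see START_TIME_UTC), so shift the expected values by the local UTC offset
  # before comparing.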

  # Start Time
  start_time = re.search(r'\n\s+Start Time:\s+(.*?)\n', profile_text)
  assert start_time is not None
  start_time_obj = datetime.strptime(start_time.group(1)[:-3], QUERY_PROFILE_DT_FORMAT)
  start_time_obj_utc = start_time_obj + utc_offset
  expected = start_time_obj_utc.strftime(QUERY_PROFILE_DT_FORMAT)
  actual = column_val(TQueryTableColumn.START_TIME_UTC)[:-3]
  assert actual == expected, "start time incorrect, expected '{}' but was '{}'" \
      .format(expected, actual)

  # End Time (not in table, but needed for duration calculation)
  end_time = re.search(r'\n\s+End Time:\s+(.*?)\n', profile_text)
  assert end_time is not None
  end_time_obj = datetime.strptime(end_time.group(1)[:-3], QUERY_PROFILE_DT_FORMAT)

  # Query Duration (allow values within 0.1% of the profile-derived duration)
  value = column_val(TQueryTableColumn.TOTAL_TIME_MS)
  duration = end_time_obj - start_time_obj
  min_allowed = round_to_3(duration.total_seconds() * 1000 * 0.999)
  max_allowed = round_to_3(duration.total_seconds() * 1000 * 1.001)
  assert min_allowed <= float(value) <= max_allowed, "total time incorrect"

  # Query Options Set By Configuration
  value = column_val(TQueryTableColumn.QUERY_OPTS_CONFIG)
  if query_state_value == "EXCEPTION":
    assert value != "", "query options set by config incorrect"
  else:
    query_opts = re.search(r'\n\s+Query Options \(set by configuration\):\s+(.*?)\n',
        profile_text)
    assert query_opts is not None
    assert value == query_opts.group(1).replace("&apos;", "'"), \
        "query opts set by config incorrect"

  # Resource Pool
  value = column_val(TQueryTableColumn.RESOURCE_POOL)
  if query_state_value == "EXCEPTION":
    assert value == "", "resource pool incorrect"
  else:
    if query_type != "DDL":
      req_pool = re.search(r'\n\s+Request Pool:\s+(.*?)\n', profile_text)
      assert req_pool is not None
      assert value == req_pool.group(1), "request pool incorrect"
    else:
      assert value == "", "request pool not empty"

  # Per-host Memory Estimate
  value = column_val(TQueryTableColumn.PER_HOST_MEM_ESTIMATE)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert value == "0", "per-host memory estimate incorrect"
  else:
    # First check the Estimated Per-Host Mem from the query profile. This value may not
    # match though, because certain query options can cause this value to diverge from
    # the per-host memory estimate stored in the query history table.
    est_perhost_mem = re.search(r'\n\s+Estimated Per-Host Mem:\s+(\d+)\n', profile_text)
    assert est_perhost_mem is not None
    if est_perhost_mem.group(1) != value:
      # The profile and db values diverged, so use the Per-Host Resource Estimates field
      # from the query profile as the expected value. Since the query profile value is
      # an estimate, it's not as good to use, but it's all we have available.
      perhost_mem_est = re.search(r'\nPer-Host Resource Estimates:\s+Memory\=(.*?)\n',
          profile_text)
      assert perhost_mem_est is not None
      assert_byte_str(expected_str=perhost_mem_est.group(1), actual_bytes=value,
          msg="per-host memory estimate incorrect", unit_combined=True)

  # Dedicated Coordinator Memory Estimate
  # This value is different because it is the minimum of the query option
  # MAX_MEM_ESTIMATE_FOR_ADMISSION or a calculation that includes a 100 MiB buffer.
  # Thus, callers must specify if the query being asserted had that option set.
  value = column_val(TQueryTableColumn.DEDICATED_COORD_MEM_ESTIMATE)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert value == "0", "dedicated coordinator memory estimate incorrect"
  elif query_type == "DML":
    assert value == str(DEDICATED_COORD_SAFETY_BUFFER_BYTES), \
        "dedicated coordinator memory estimate incorrect"
  else:
    if max_mem_for_admission is not None:
      # The MAX_MEM_ESTIMATE_FOR_ADMISSION query option was specified, thus that should
      # be the value that was written to the database.
      assert str(max_mem_for_admission) == value, \
          "dedicated coordinator memory estimate incorrect"
    else:
      root_mem = re.search(r'\n\nF\d+:PLAN FRAGMENT.*?mem-estimate=(\S+?) mem',
          profile_text, re.DOTALL)
      assert root_mem is not None, "dedicated coordinator memory estimate incorrect"
      buffer = DEDICATED_COORD_SAFETY_BUFFER_BYTES
      assert_byte_str(expected_str=root_mem.group(1),
          actual_bytes=int(value) - buffer,
          msg="dedicated coordinator memory estimate incorrect", unit_combined=True)

  # Per-Host Fragment Instances
  value = column_val(TQueryTableColumn.PER_HOST_FRAGMENT_INSTANCES)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert value == "", "per-host fragment instances incorrect"
  else:
    perhost_frags = re.search(r'\n\s+Per Host Number of Fragment Instances:\s+(.*?)\n',
        profile_text)
    assert perhost_frags is not None
    expected = ",".join(sorted(perhost_frags.group(1).replace("(", "=")
        .replace(")", "").split(" ")))
    assert value == expected, ('per-host fragment instances incorrect.'
        ' expected="{0}" actual="{1}"').format(expected, value)
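  # Illustrative example (hypothetical hosts): a profile value such as
  # "host1:22000(4) host2:22000(4)" is normalized above to
  # "host1:22000=4,host2:22000=4" before comparing against the table value.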

  # Backends Count
  value = column_val(TQueryTableColumn.BACKENDS_COUNT)
  num_bck = re.search(r'\n\s+\-\s+NumBackends:\s+(\d+)', profile_text)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert num_bck is None
    assert value == "0", "backends count incorrect"
  else:
    assert num_bck is not None
    assert value == num_bck.group(1), "backends count incorrect"

  # Admission Result
  value = column_val(TQueryTableColumn.ADMISSION_RESULT)
  adm_result = re.search(r'\n\s+Admission result:\s+(.*?)\n', profile_text)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert adm_result is None
    assert value == "", "admission result incorrect"
  else:
    assert adm_result is not None
    assert value == adm_result.group(1), "admission result incorrect"

  # Cluster Memory Admitted
  value = column_val(TQueryTableColumn.CLUSTER_MEMORY_ADMITTED)
  clust_mem = re.search(r'\n\s+Cluster Memory Admitted:\s+(.*?)\n', profile_text)
  if query_state_value == "EXCEPTION":
    assert clust_mem is None
  else:
    if query_type != "DDL":
      assert clust_mem is not None
      assert_byte_str(expected_str=clust_mem.group(1), actual_bytes=value,
          msg="cluster memory admitted incorrect")
    else:
      assert value == "0", "cluster memory not zero"

  # Executor Group
  value = column_val(TQueryTableColumn.EXECUTOR_GROUP)
  exec_group = re.search(r'\n\s+Executor Group:\s+(.*?)\n', profile_text)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert exec_group is None
    assert value == "", "executor group should not have been found"
  else:
    assert exec_group is not None
    assert value == exec_group.group(1), "executor group incorrect"

  # Executor Groups
  value = column_val(TQueryTableColumn.EXECUTOR_GROUPS)
  # The following regular expression matches both named and unnamed executor groups in
  # the query profile. For example, both of the following lines will match this regex:
  #   Executor group 1 (small):
  #   Executor group 1:
  exec_groups = re.search(r'\n\s+(Executor group \d+(?:\s+\(\w+\))?:.*?)\n\s+PlannerInfo',
      profile_text, re.DOTALL)
  if query_state_value == "EXCEPTION":
    assert exec_groups is None, "executor groups should not have been found"
  else:
    assert exec_groups is not None
    dedent_str = re.sub(r'^\s{6}', '', exec_groups.group(1), flags=re.MULTILINE)
    assert value == dedent_str, "executor groups incorrect"

  # Exec Summary
  value = column_val(TQueryTableColumn.EXEC_SUMMARY)
  exec_sum = re.search(r'\n\s+ExecSummary:\s*\n(.*)\n\s+Errors', profile_text, re.DOTALL)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert exec_sum is None
    assert value == ""
  else:
    assert exec_sum is not None
    assert value == exec_sum.group(1)

  # Rows Fetched
  value = column_val(TQueryTableColumn.NUM_ROWS_FETCHED)
  rows_fetched = re.search(r'\n\s+\-\s+NumRowsFetched:\s+\S+\s+\((\d+)\)', profile_text)
  if query_state_value == "EXCEPTION":
    assert rows_fetched is None
  else:
    assert rows_fetched is not None
    assert value == rows_fetched.group(1)

  # Row Materialization Rate
  value = column_val(TQueryTableColumn.ROW_MATERIALIZATION_ROWS_PER_SEC)
  if query_state_value == "EXCEPTION" or query_type == "DDL" or query_type == "DML":
    assert value == "0", "row materialization rate incorrect"
  else:
    row_mat = re.search(r'\n\s+\-\s+RowMaterializationRate:\s+(\S+)\s+([MK])?',
        profile_text)
    assert row_mat is not None
    tolerance = int(value) * 0.005
    expected_row_mat = 0
    if row_mat.group(2) == "K":
      expected_row_mat = int(float(row_mat.group(1)) * 1000)
    elif row_mat.group(2) == "M":
      expected_row_mat = int(float(row_mat.group(1)) * 1000000)
    else:
      expected_row_mat = int(float(row_mat.group(1)))
    assert expected_row_mat - tolerance <= int(value) \
        <= expected_row_mat + tolerance, "row materialization rate incorrect"

  # Row Materialization Time
  value = column_val(TQueryTableColumn.ROW_MATERIALIZATION_TIME_MS)
  row_mat_tmr = re.search(r'\n\s+\-\s+RowMaterializationTimer:\s+(.*?)\n', profile_text)
  if query_state_value == "EXCEPTION":
    assert row_mat_tmr is None
  elif query_type == "DDL" or query_type == "DML":
    assert row_mat_tmr is not None
    assert row_mat_tmr.group(1) == "0.000ns", "row materialization timer incorrect"
  else:
    assert row_mat_tmr is not None
    assert_time_str(row_mat_tmr.group(1), value,
        "row materialization time incorrect")

  # Compressed Bytes Spilled
  scratch_bytes_total = 0
  for sbw in re.findall(r'\n\s+\-\s+ScratchBytesWritten:.*?\((\d+)\)', profile_text):
    scratch_bytes_total += int(sbw)
  assert int(column_val(TQueryTableColumn.COMPRESSED_BYTES_SPILLED)) \
      == scratch_bytes_total

  # Parse out only the query timeline.
  timeline = re.search(r'\n\s+Query Timeline:(.*?)\n\s+Frontend', profile_text, re.DOTALL)
  assert timeline is not None, "query timeline not found"
  timeline = timeline.group(1)

  # Event Timeline Planning Finished
  value = column_val(TQueryTableColumn.EVENT_PLANNING_FINISHED)
  if query_state_value == "EXCEPTION":
    assert value == "0.000", "planning finished event incorrect"
  else:
    event = re.search(r'\n\s+\-\s+Planning finished:\s+(\S+)', timeline)
    assert event is not None, "planning finished event missing"
    assert_time_str(event.group(1), value, "planning finished event incorrect")

  # Event Timeline Submit for Admission
  value = column_val(TQueryTableColumn.EVENT_SUBMIT_FOR_ADMISSION)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert value == "0.000", "submit for admission event incorrect"
  else:
    event = re.search(r'\n\s+\-\s+Submit for admission:\s+(\S+)', timeline)
    assert event is not None, "submit for admission event missing"
    assert_time_str(event.group(1), value, "submit for admission event incorrect")

  # Event Timeline Completed Admission
  value = column_val(TQueryTableColumn.EVENT_COMPLETED_ADMISSION)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert value == "0.000", "completed admission event incorrect"
  else:
    event = re.search(r'\n\s+\-\s+Completed admission:\s+(\S+)', timeline)
    assert event is not None, "completed admission event missing"
    assert_time_str(event.group(1), value, "completed admission event incorrect")

  # Event Timeline All Backends Started
  value = column_val(TQueryTableColumn.EVENT_ALL_BACKENDS_STARTED)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert value == "0.000", "all backends started event incorrect"
  else:
    event = re.search(r'\n\s+\-\s+All \d+ execution backends \(\d+ fragment instances\)'
        r' started:\s+(\S+)', timeline)
    assert event is not None, "all backends started event missing"
    assert_time_str(event.group(1), value, "all backends started event incorrect")

  # Event Timeline Rows Available
  value = column_val(TQueryTableColumn.EVENT_ROWS_AVAILABLE)
  if query_state_value == "EXCEPTION" or query_type == "DML":
    assert value == "0.000", "rows available event incorrect"
  else:
    event = re.search(r'\n\s+\-\s+Rows available:\s+(\S+)', timeline)
    assert event is not None, "rows available event missing"
    assert_time_str(event.group(1), value, "rows available event incorrect")

  # Event Timeline First Row Fetched
  value = column_val(TQueryTableColumn.EVENT_FIRST_ROW_FETCHED)
  if query_state_value == "EXCEPTION" or query_type == "DDL" or query_type == "DML":
    assert value == "0.000", "first row fetched event incorrect"
  else:
    event = re.search(r'\n\s+\-\s+First row fetched:\s+(\S+)', timeline)
    assert event is not None, "first row fetched event missing"
    assert_time_str(event.group(1), value, "first row fetched event incorrect")

  # Event Timeline Last Row Fetched
  value = column_val(TQueryTableColumn.EVENT_LAST_ROW_FETCHED)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert value == "0.000", "last row fetched event incorrect"
  else:
    event = re.search(r'\n\s+\-\s+Last row fetched:\s+(\S+)', timeline)
    assert event is not None, "last row fetched event missing"
    assert_time_str(event.group(1), value, "last row fetched event incorrect")

  # Event Timeline Unregister Query
  value = column_val(TQueryTableColumn.EVENT_UNREGISTER_QUERY)
  event = re.search(r'\n\s+\-\s+Unregister query:\s+(\S+)', timeline)
  assert event is not None, "unregister query event missing"
  assert_time_str(event.group(1), value, "unregister query event incorrect")

  # Read IO Wait Total
  value = column_val(TQueryTableColumn.READ_IO_WAIT_TOTAL_MS)
  total_read_wait = 0
  if (query_state_value != "EXCEPTION" and query_type == "QUERY") or value != "0":
    re_wait_time = re.compile(r'^\s+\-\s+ScannerIoWaitTime:\s+(.*?)$')
    read_waits = assert_scan_node_metrics(re_wait_time, profile_lines)
    for r in read_waits:
      total_read_wait += convert_to_milliseconds(r)
    tolerance = total_read_wait * 0.001
    assert total_read_wait - tolerance <= float(value) <= \
        total_read_wait + tolerance, "read io wait time total incorrect"
  else:
    assert value == "0.000"

  # Read IO Wait Average
  value = column_val(TQueryTableColumn.READ_IO_WAIT_MEAN_MS)
  if (query_state_value != "EXCEPTION" and query_type == "QUERY"
      and len(read_waits) != 0) or value != "0.000":
    avg_read_wait = round_to_3(float(total_read_wait / len(read_waits)))
    assert avg_read_wait - tolerance <= float(value) <= avg_read_wait + tolerance, \
        "read io wait time average incorrect"
  else:
    assert value == "0.000"

  # Total Bytes Read From Cache
  value = column_val(TQueryTableColumn.BYTES_READ_CACHE_TOTAL)
  if (query_state_value != "EXCEPTION" and query_type == "QUERY") or value != "0":
    re_cache_read = re.compile(r'^\s+\-\s+DataCacheHitBytes:\s+.*?\((\d+)\)$')
    read_from_cache = assert_scan_node_metrics(re_cache_read, profile_lines)
    total_read = 0
    for r in read_from_cache:
      total_read += int(r)
    assert total_read == int(value), "bytes read from cache total incorrect"
  else:
    assert value == "0"

  # Total Bytes Read
  value = column_val(TQueryTableColumn.BYTES_READ_TOTAL)
  bytes_read = re.search(r'\n\s+\-\s+TotalBytesRead:\s+.*?\((\d+)\)\n', profile_text)
  if query_state_value != "EXCEPTION" and query_type == "QUERY":
    assert bytes_read is not None, "total bytes read missing"
  if bytes_read is not None:
    assert value == bytes_read.group(1), "total bytes read incorrect"

  # Calculate all peak memory usage stats by scraping the query profile.
  peak_mem_cnt = 0
  min_peak_mem = 0
  max_peak_mem = 0
  total_peak_mem = 0
  for peak_mem in re.findall(r'\n\s+Per Node Peak Memory Usage:(.*?)\n', profile_text):
    for node in re.findall(r'\s+.*?:\d+\((.*?)\)', peak_mem):
      peak_mem_cnt += 1
      conv = convert_to_bytes(node)
      total_peak_mem += conv
      if conv < min_peak_mem or min_peak_mem == 0:
        min_peak_mem = conv
      if conv > max_peak_mem:
        max_peak_mem = conv
  if query_state_value != "EXCEPTION" and query_type != "DDL":
    assert peak_mem_cnt > 0, "did not find per node peak memory usage"

  # Per Node Peak Memory Usage Min
  value = column_val(TQueryTableColumn.PERNODE_PEAK_MEM_MIN)
  tolerance = int(min_peak_mem * 0.005)
  assert min_peak_mem - tolerance <= int(value) <= min_peak_mem + tolerance, \
      "pernode peak memory minimum incorrect"

  # Per Node Peak Memory Usage Max
  value = column_val(TQueryTableColumn.PERNODE_PEAK_MEM_MAX)
  tolerance = int(max_peak_mem * 0.005)
  assert max_peak_mem - tolerance <= int(value) <= max_peak_mem + tolerance, \
      "pernode peak memory maximum incorrect"

  # Per Node Peak Memory Usage Mean
  value = column_val(TQueryTableColumn.PERNODE_PEAK_MEM_MEAN)
  mean_peak_mem = 0
  if peak_mem_cnt > 0:
    mean_peak_mem = int(total_peak_mem / peak_mem_cnt)
  tolerance = int(max_peak_mem * 0.005)
  assert mean_peak_mem - tolerance <= int(value) <= mean_peak_mem + tolerance, \
      "pernode peak memory mean incorrect"

  # SQL statement
  sql_stmt = re.search(r'\n\s+Sql Statement:\s+(.*?)\n', profile_text)
  assert sql_stmt is not None
  assert column_val(TQueryTableColumn.SQL) == sql_stmt.group(1), "sql incorrect"

  # Query Plan
  value = column_val(TQueryTableColumn.PLAN)
  plan = re.search(r'\n\s+Plan:\s*\n(.*)\n\s+Estimated Per-Host Mem', profile_text,
      re.DOTALL)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert plan is None
    assert value == ""
  else:
    assert plan is not None
    assert value == plan.group(1)

  # Tables Queried
  value = column_val(TQueryTableColumn.TABLES_QUERIED)
  tables = re.search(r'\n\s+Tables Queried:\s+(.*?)\n', profile_text)
  if query_state_value == "EXCEPTION" or query_type == "DDL":
    assert tables is None
    assert value == ""
  else:
    assert tables is not None
    assert value == tables.group(1)

  # Select Columns
  assert_col(TQueryTableColumn.SELECT_COLUMNS, r'\n\s+Select Columns:\s+(.*?)\n')

  # Where Columns
  assert_col(TQueryTableColumn.WHERE_COLUMNS, r'\n\s+Where Columns:\s+(.*?)\n')

  # Join Columns
  assert_col(TQueryTableColumn.JOIN_COLUMNS, r'\n\s+Join Columns:\s+(.*?)\n')

  # Aggregate Columns
  assert_col(TQueryTableColumn.AGGREGATE_COLUMNS, r'\n\s+Aggregate Columns:\s+(.*?)\n')

  # OrderBy Columns
  assert_col(TQueryTableColumn.ORDERBY_COLUMNS, r'\n\s+OrderBy Columns:\s+(.*?)\n')

  # Coordinator and Executor Slots Columns
  admission_slots = re.findall(
      r'\n\s+\-\s+AdmissionSlots:\s+(\d*?)\s+.*?\n', profile_text)

  value = column_val(TQueryTableColumn.COORDINATOR_SLOTS)
  if TQueryTableColumn.COORDINATOR_SLOTS in expected_overrides:
    assert value == expected_overrides[TQueryTableColumn.COORDINATOR_SLOTS]
  else:
    # The first host has the coordinator admission slots.
    expected_coordinator_slots = admission_slots[0] if len(admission_slots) > 0 else "0"
    assert value == expected_coordinator_slots

  value = column_val(TQueryTableColumn.EXECUTOR_SLOTS)
  if TQueryTableColumn.EXECUTOR_SLOTS in expected_overrides:
    assert value == expected_overrides[TQueryTableColumn.EXECUTOR_SLOTS]
  else:
    # Take executor admission slots from the second impalad.
    expected_executor_slots = admission_slots[1] if len(admission_slots) > 1 else "0"
    assert value == expected_executor_slots

  return ret_data
# function assert_query
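
# Example usage (hypothetical values; "client" is a test-framework Impala client):
#   assert_query(QUERY_TBL_LOG, client, impalad=coordinator_impalad,
#       query_id="d44bf5e624dbf541:3a0fe2969a1f1111")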


def assert_scan_node_metrics(re_metric, profile_lines):
  """Retrieves metrics reported under HDFS_SCAN_NODEs, removing any metrics from
     Averaged Fragments. The provided re_metric must be a compiled regular expression
     with at least one capture group. Returns a list of the contents of the first
     capture group in the re_metric regular expression for all matching metrics."""
  metrics = []

  re_in_scan = re.compile(r'^\s+HDFS_SCAN_NODE')
  re_avg_fgmt = re.compile(r'^(\s+)Averaged Fragment')

  in_scan = False
  in_avg_fgmt = 0
  for line in profile_lines:
    avg_fmt_res = re_avg_fgmt.search(line)
    if avg_fmt_res is not None:
      # Averaged Fragments sometimes have HDFS_SCAN_NODEs which must be skipped.
      in_avg_fgmt = len(avg_fmt_res.group(1))
    elif in_avg_fgmt > 0 and line[in_avg_fgmt + 1] != " ":
      # Found a line at the same indentation as the previous Averaged Fragment, thus
      # we successfully skipped over any HDFS_SCAN_NODEs if they existed.
      in_avg_fgmt = 0
    elif in_avg_fgmt == 0 and re_in_scan.match(line) is not None:
      # Found a HDFS_SCAN_NODE that was not under an Averaged Fragment.
      in_scan = True
    elif in_scan:
      # Search through the HDFS_SCAN_NODE for the metric.
      res = re_metric.search(line)
      if res is not None:
        metrics.append(res.group(1))
        in_scan = False

  return metrics
# function assert_scan_node_metrics


def assert_csv_col(client, query_tbl, col, query_id, expected_list, db="tpcds"):
  """Asserts that a single column that contains a string of comma separated values
     matches a list of expected values. Order of elements does not matter."""
  print("Query Id: {0}".format(query_id))

  success = False
  sql_results = None
  start_time = time()
  while (time() - start_time <= ASSERT_QUERY_TIMEOUT_S):
    # Force Impala to process the inserts to the completed queries table.
    if query_tbl != QUERY_TBL_LIVE:
      client.execute("refresh " + query_tbl)

    # Assert the query was written correctly to the query log table.
    sql_results = client.execute("select * from {0} where query_id='{1}'".format(
        query_tbl, query_id))
    if sql_results.success and len(sql_results.data) == 1:
      success = True
      break

    # Query is not yet available in the workload management table, wait and try again.
    sleep(1)

  assert success, "Did not find query '{}' in the '{}' table after multiple attempts" \
      .format(query_id, query_tbl)

  data = sql_results.data[0].split("\t")

  actual = []
  if len(data[col]) > 0:
    actual = data[col].split(",")

  # Prepend the database to the beginning of each item in the expected_list.
  if db is not None:
    expected_list = list(map(lambda item: "{}.{}".format(db, item), expected_list))

  assert len(actual) == len(expected_list), "Column '{}' for query '{}' had different " \
      "lengths between the actual and expected lists:\n actual (length {}): {}\n " \
      "expected (length {}): {}".format(TQueryTableColumn._VALUES_TO_NAMES[col],
      query_id, len(actual), sorted(actual), len(expected_list), sorted(expected_list))

  for expected in expected_list:
    assert expected in actual, "Column '{}' for query '{}' was missing expected value " \
        "'{}'\n actual (length {}): {}\n expected (length {}): {}" \
        .format(TQueryTableColumn._VALUES_TO_NAMES[col], query_id, expected,
        len(actual), sorted(actual), len(expected_list), sorted(expected_list))


def redaction_rules_file():
  """Provides the path to a redaction file that redacts the word
     'supercalifragilisticexpialidocious'."""
  return "{}/testdata/workload_mgmt/redaction.json".format(os.environ["IMPALA_HOME"])