# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # # Common for connections to Impala. Currently supports Beeswax connections and # in the future will support HS2 connections. Provides tracing around all # operations. from __future__ import absolute_import, division, print_function import abc import getpass import logging import re import time from future.utils import with_metaclass import impala.dbapi as impyla import impala.error as impyla_error import impala.hiveserver2 as hs2 from impala_thrift_gen.beeswax.BeeswaxService import QueryState from impala_thrift_gen.Query.ttypes import TQueryOptions from impala_thrift_gen.RuntimeProfile.ttypes import TRuntimeProfileFormat from tests.beeswax.impala_beeswax import ( DEFAULT_SLEEP_INTERVAL, ImpalaBeeswaxClient, ImpalaBeeswaxException, ) import tests.common from tests.common.patterns import LOG_FORMAT from tests.common.test_vector import BEESWAX, HS2, HS2_HTTP from tests.util.thrift_util import op_handle_to_query_id, session_handle_to_session_id LOG = logging.getLogger(__name__) console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) # All logging needs to be either executable SQL or a SQL comment (prefix with --). console_handler.setFormatter(logging.Formatter(LOG_FORMAT)) LOG.addHandler(console_handler) LOG.propagate = False # Regular expression that matches the "progress" entry in the HS2 log. PROGRESS_LOG_RE = re.compile( r'^Query [a-z0-9:]+ [0-9]+% Complete \([0-9]+ out of [0-9]+\)$') MAX_SQL_LOGGING_LENGTH = 128 * 1024 # Tuple of root exception types from different client protocol. IMPALA_CONNECTION_EXCEPTION = (ImpalaBeeswaxException, impyla_error.Error) # String representation of ClientRequestState::ExecState INITIALIZED = 'INITIALIZED' PENDING = 'PENDING' RUNNING = 'RUNNING' FINISHED = 'FINISHED' ERROR = 'ERROR' # ExecState that is final. EXEC_STATES_FINAL = set([FINISHED, ERROR]) # Possible ExecState after query passed admission controller. EXEC_STATES_ADMITTED = set([RUNNING, FINISHED, ERROR]) # Mapping of a ExecState to a set of possible future ExecState. LEGAL_FUTURE_STATES = { INITIALIZED: set([PENDING, RUNNING, FINISHED, ERROR]), PENDING: set([RUNNING, FINISHED, ERROR]), RUNNING: set([FINISHED, ERROR]), FINISHED: set([ERROR]), ERROR: set() } def has_legal_future_state(impala_state, future_states): """Return True if 'impala_state' can transition to one of state listed in 'future_states'.""" assert impala_state in LEGAL_FUTURE_STATES expected_impala_states = set(future_states) return len(LEGAL_FUTURE_STATES[impala_state] & expected_impala_states) > 0 # test_exprs.py's TestExprLimits executes extremely large SQLs (multiple MBs). It is the # only test that runs SQL larger than 128KB. Logging these SQLs in execute() increases # the size of the JUnitXML files, causing problems for users of JUnitXML like Jenkins. # This function limits the size of the returned string if it is larger than 128KB. def format_sql_for_logging(sql_stmt): """If the 'sql_stmt' is shorter than MAX_SQL_LOGGING_LENGTH, only wrap sql_stmt with new lines and semicolon. If it is larger than MAX_SQL_LOGGING_LENGTH, truncate it and comment it out. This function returns a unicode string.""" # sql_stmt could contain Unicode characters, so explicitly use unicode literals # so that Python 2 works. if (len(sql_stmt) <= MAX_SQL_LOGGING_LENGTH): return u"\n{0};\n".format(sql_stmt) else: # The logging output should be valid SQL, so the truncated SQL is commented out. truncated_sql = u'\n--'.join( [line for line in sql_stmt[0:MAX_SQL_LOGGING_LENGTH].split("\n")]) return (u"\n-- Skip logging full SQL statement of length {0}" u"\n-- Logging a truncated version, commented out:" u"\n-- {1}" u"\n-- [...]\n").format(len(sql_stmt), truncated_sql) def build_summary_table_from_thrift(thrift_exec_summary): from shell.exec_summary import build_exec_summary_table result = list() build_exec_summary_table(thrift_exec_summary, 0, 0, False, result, is_prettyprint=False, separate_prefix_column=True) keys = ['prefix', 'operator', 'num_hosts', 'num_instances', 'avg_time', 'max_time', 'num_rows', 'est_num_rows', 'peak_mem', 'est_peak_mem', 'detail'] output = list() for row in result: assert len(keys) == len(row) summ_map = dict(zip(keys, row)) output.append(summ_map) return output def collect_default_query_options(options, name, val): name = name.lower() if val is not None: val = str(val).strip('"') if ',' in val or '/' in val: # Value is a list or a timezone name containing a slash. Wrap it with double quotes. val = '"{}"'.format(val) if not val: # Value is optional with None as default or just turned into an empty string. val = '""' options[name] = val def parse_query_options_from_thrift(): """Populate 'options' map with default query options parsed from TQueryOptions attributes.""" result = dict() tquery_opts = TQueryOptions() for key in dir(tquery_opts): non_opts_attrs = ['read', 'write', 'validate', 'thrift_spec'] if not key.startswith('_') and key not in non_opts_attrs: value = getattr(tquery_opts, key) if isinstance(value, set): # The default value of some query options, e.g., # enabled_runtime_filter_types, can be a set of integer. # Turn the set into comma separated values. value = ','.join([str(v) for v in value]) # No need to supply 'kind' since TQueryOptions already exclude # removed query options. collect_default_query_options(result, key, value) return result # A map of default query option obtained from TQueryOptions. # Query option names (the keys) are in lower case string for consistency. # Values are all strings and might be double-quoted, making it legal for both setting # through 'SET' query or ImpalaConnection.set_configuration_option(). DEFAULT_QUERY_OPTIONS = parse_query_options_from_thrift() # Common wrapper around the internal types of HS2/Beeswax operation/query handles. class OperationHandle(object): def __init__(self, handle, sql_stmt): self.__handle = handle self.__sql_stmt = sql_stmt def get_handle(self): return self.__handle def sql_stmt(self): return self.__sql_stmt # Represents an Impala connection. class ImpalaConnection(with_metaclass(abc.ABCMeta, object)): def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() @abc.abstractmethod def get_test_protocol(self): """Return client protocol name that is specific to Impala test framework. Possible return value are either of 'beeswax', 'hs2', or 'hs2-http'.""" pass @abc.abstractmethod def get_host_port(self): """Return the 'host:port' string of impala server that this object connecting to.""" pass @abc.abstractmethod def set_configuration_option(self, name, value, is_log_sql=True): """Sets a configuration option name to the given value. Return True if option is changing. Otherwise, return False (option already has the same value). If is_log_sql True, log the equivalent SET query to INFO. Do note though that the option change does not actually happen by issuing SET query.""" pass def set_configuration(self, config_option_dict): """Replaces existing configuration with the given dictionary. If config_option_dict is an empty dictionary, simply clear current client configuration.""" assert isinstance(config_option_dict, dict), \ "config_option_dict must be a dictionary" self.clear_configuration() if not config_option_dict: return log_lines = list() for name, value in config_option_dict.items(): if self.set_configuration_option(name, value, False): log_lines.append("set {0}={1};".format(name, value)) if log_lines: self.log_client("set_configuration:\n\n{}\n".format('\n'.join(log_lines))) @abc.abstractmethod def clear_configuration(self): """Clears all existing configuration.""" pass def get_default_configuration(self): """Return the default configuration for the connection, before any modifications are made to the session state. Returns a map with the config variable as the key and a string representation of the default value as the value.""" return DEFAULT_QUERY_OPTIONS.copy() @abc.abstractmethod def connect(self): """Opens the connection""" pass @abc.abstractmethod def close(self): """Closes the connection. Can be called multiple times""" pass @abc.abstractmethod def close_query(self, handle, fetch_profile_after_close=False): """Closes the query.""" pass @abc.abstractmethod def get_state(self, operation_handle): """Returns the state of a query. May raise en error, depending on connection type.""" pass @abc.abstractmethod def get_impala_exec_state(self, operation_handle): """Returns a string translation from client specific state of operation_handle to Impala's ClientRequestState::ExecState.""" pass def __is_at_exec_state(self, operation_handle, impala_state): self.log_handle( operation_handle, 'checking ' + impala_state + ' state for operation') return self.get_impala_exec_state(operation_handle) == impala_state def state_is_finished(self, operation_handle): """Returns whether the Impala exec state of a operation_handle is FINISHED. DEPRECATED: use is_finished() instead.""" return self.is_finished(operation_handle) def is_initialized(self, operation_handle): """Returns whether the Impala exec state of a operation_handle is INITIALIZED""" return self.__is_at_exec_state(operation_handle, INITIALIZED) def is_pending(self, operation_handle): """Returns whether the Impala exec state of a operation_handle is PENDING""" return self.__is_at_exec_state(operation_handle, PENDING) def is_running(self, operation_handle): """Returns whether the Impala exec state of a operation_handle is RUNNING""" return self.__is_at_exec_state(operation_handle, RUNNING) def is_finished(self, operation_handle): """Returns whether the Impala exec state of a operation_handle is FINISHED""" return self.__is_at_exec_state(operation_handle, FINISHED) def is_error(self, operation_handle): """Returns whether the Impala exec state of a operation_handle is ERROR. Internally, it will call get_state(), and any exception thrown by get_state() will cause this method to return True.""" return self.__is_at_exec_state(operation_handle, ERROR) def is_executing(self, operation_handle): """Returns whether the state of a operation_handle is executing or will be executing. Return False if operation_handle has ended, either successful or with error.""" return self.get_impala_exec_state(operation_handle) not in EXEC_STATES_FINAL def is_admitted(self, operation_handle): """Returns whether the state of a operation_handle has passed Impala admission control. Return True if handle state is error.""" return self.get_impala_exec_state(operation_handle) in EXEC_STATES_ADMITTED @abc.abstractmethod def get_log(self, operation_handle): """Returns the log of an operation as a string, with entries separated by newlines.""" pass @abc.abstractmethod def cancel(self, operation_handle): """Cancels an in-flight operation""" pass def execute(self, sql_stmt, user=None, fetch_profile_after_close=False, # noqa: U100 fetch_exec_summary=False, # noqa: U100 profile_format=TRuntimeProfileFormat.STRING): # noqa: U100 """Executes a query and fetches the results""" pass @abc.abstractmethod def execute_async(self, sql_stmt): """Issues a query and returns the handle to the caller for processing. Only one async operation per connection at a time is supported, due to limitations of the Beeswax protocol and the Impyla client.""" pass @abc.abstractmethod def fetch(self, sql_stmt, operation_handle, max_rows=-1, discard_results=False): """Fetches query results up to max_rows given a handle and sql statement. Caller must ensure that query has passed PENDING state before calling fetch. If max_rows < 0, all rows are fetched. If max_rows > 0 but the number of rows returned is less than max_rows, all the rows have been fetched. Return None if discard_results is True. TODO: 'sql_stmt' can be obtained from 'operation_handle'.""" pass @abc.abstractmethod def get_runtime_profile(self, operation_handle, profile_format=TRuntimeProfileFormat.STRING): """Get runtime profile of given 'operation_handle'. Handle must stay open.""" pass @abc.abstractmethod def handle_id(self, operation_handle): """Return a string id for given operation_handle. Most implementations will return an Impala query id for given handle. Otherwise, return str(operation_handle).""" pass def log_handle(self, operation_handle, message): """Log 'message' at INFO level, along with id of 'operation_handle'.""" handle_id = self.handle_id(operation_handle) LOG.info(u"{0}: {1}".format(handle_id, message)) def log_client(self, message): """Log 'message' at INFO level, prefixed wih the protocol name of this connection.""" LOG.info(u"{0}: {1}".format(self.get_test_protocol(), message)) def wait_for_impala_state(self, operation_handle, expected_impala_state, timeout): """Waits for the given 'operation_handle' to reach the 'expected_impala_state'. 'expected_impala_state' must be a string of either 'INITIALIZED', 'PENDING', 'RUNNING', 'FINISHED', or 'ERROR'. If it does not reach the given state within 'timeout' seconds, the method throws an AssertionError. """ self.wait_for_any_impala_state(operation_handle, [expected_impala_state], timeout) def wait_for_any_impala_state(self, operation_handle, expected_impala_states, timeout_s): """Waits for the given 'operation_handle' to reach one of 'expected_impala_states'. Each string in 'expected_impala_states' must either be 'INITIALIZED', 'PENDING', 'RUNNING', 'FINISHED', or 'ERROR'. If it does not reach one of the given states within 'timeout' seconds, the method throws an AssertionError. Returns the final state. """ start_time = time.time() timeout_msg = None while True: impala_state = self.get_impala_exec_state(operation_handle) interval = time.time() - start_time if impala_state in expected_impala_states: # Reached one of expected_impala_states. break elif not has_legal_future_state(impala_state, expected_impala_states): timeout_msg = ("query '{0}' can not transition from last known state '{1}' to " "any of the expected states {2}. Stop waiting after {3} " "seconds.").format( self.handle_id(operation_handle), impala_state, expected_impala_states, interval) break elif interval >= timeout_s: timeout_msg = ("query '{0}' did not reach one of the expected states {1}, last " "known state {2}").format( self.handle_id(operation_handle), expected_impala_states, impala_state) break time.sleep(DEFAULT_SLEEP_INTERVAL) if timeout_msg is not None: raise tests.common.errors.Timeout(timeout_msg) return impala_state @abc.abstractmethod def wait_for_admission_control(self, operation_handle, timeout_s=60): """Given an 'operation_handle', polls the coordinator waiting for it to complete admission control processing of the query. Return True if query pass admission control after given 'timeout_s'.""" pass @abc.abstractmethod def get_admission_result(self, operation_handle): """Given an 'operation_handle', returns the admission result from the query profile""" pass @abc.abstractmethod def get_exec_summary(self, operation_handle): # noqa: U100 pass def get_exec_summary_table(self, operation_handle): summary_table = list() summary = self.get_exec_summary(operation_handle) if summary: summary_table = build_summary_table_from_thrift(summary) return summary_table # Represents a connection to Impala using the Beeswax API. class BeeswaxConnection(ImpalaConnection): # This is based on ClientRequestState::BeeswaxQueryState(). __QUERY_STATE_TO_EXEC_STATE = { QueryState.CREATED: INITIALIZED, QueryState.COMPILED: PENDING, QueryState.RUNNING: RUNNING, QueryState.FINISHED: FINISHED, QueryState.EXCEPTION: ERROR, # These are not official ExecState, but added to complete mapping. QueryState.INITIALIZED: 'UNIMPLEMENTED_INITIALIZED', } def __init__(self, host_port, use_kerberos=False, user=None, password=None, use_ssl=False): self.__beeswax_client = ImpalaBeeswaxClient(host_port, use_kerberos, user=user, password=password, use_ssl=use_ssl) self.__host_port = host_port self.QUERY_STATES = self.__beeswax_client.query_states def get_test_protocol(self): return BEESWAX def get_host_port(self): return self.__host_port def set_configuration_option(self, name, value, is_log_sql=True): # Only set the option if it's not already set to the same value. name = name.lower() value = str(value) if self.__beeswax_client.get_query_option(name) != value: self.__beeswax_client.set_query_option(name, value) if is_log_sql: self.log_client("\n\nset {0}={1};\n".format(name, value)) return True return False def clear_configuration(self): self.__beeswax_client.clear_query_options() # A hook in conftest sets tests.common.current_node. if hasattr(tests.common, "current_node"): self.set_configuration_option("client_identifier", tests.common.current_node) def connect(self): try: self.__beeswax_client.connect() self.log_client("connected to %s with beeswax" % self.__host_port) except Exception as e: self.log_client("failed connecting to %s with beeswax" % self.__host_port) raise e # TODO: rename to close_connection def close(self): self.log_client("closing beeswax connection to: %s" % self.__host_port) self.__beeswax_client.close_connection() def close_query(self, operation_handle, fetch_profile_after_close=False): self.log_handle(operation_handle, 'closing query for operation') return self.__beeswax_client.close_query(operation_handle.get_handle(), fetch_profile_after_close) def close_dml(self, operation_handle): self.log_handle(operation_handle, 'closing DML query') self.__beeswax_client.close_dml(operation_handle.get_handle()) def execute(self, sql_stmt, user=None, fetch_profile_after_close=False, fetch_exec_summary=False, profile_format=TRuntimeProfileFormat.STRING): assert profile_format == TRuntimeProfileFormat.STRING, ( "Beeswax client only supports getting runtime profile in STRING format.") self.log_client(u"executing against {0}\n{1}".format( self.__host_port, format_sql_for_logging(sql_stmt))) return self.__beeswax_client.execute(sql_stmt, user=user, fetch_profile_after_close=fetch_profile_after_close, fetch_exec_summary=fetch_exec_summary) def execute_async(self, sql_stmt, user=None): self.log_client(u"executing async {0}\n{1}".format( self.__host_port, format_sql_for_logging(sql_stmt))) beeswax_handle = self.__beeswax_client.execute_query_async(sql_stmt, user=user) return OperationHandle(beeswax_handle, sql_stmt) def cancel(self, operation_handle): self.log_handle(operation_handle, 'canceling operation') return self.__beeswax_client.cancel_query(operation_handle.get_handle()) def get_state(self, operation_handle): self.log_handle(operation_handle, 'getting state') return self.__beeswax_client.get_state(operation_handle.get_handle()) def get_impala_exec_state(self, operation_handle): return self.__QUERY_STATE_TO_EXEC_STATE[self.get_state(operation_handle)] def get_exec_summary(self, operation_handle): self.log_handle(operation_handle, 'getting exec summary operation') return self.__beeswax_client.get_exec_summary(operation_handle.get_handle()) def get_runtime_profile(self, operation_handle, profile_format=TRuntimeProfileFormat.STRING): assert profile_format == TRuntimeProfileFormat.STRING, ( "Beeswax client only supports getting runtime profile in STRING format.") self.log_handle(operation_handle, 'getting runtime profile operation') return self.__beeswax_client.get_runtime_profile(operation_handle.get_handle()) def wait_for_finished_timeout(self, operation_handle, timeout): self.log_handle(operation_handle, 'waiting for query to reach FINISHED state') return self.__beeswax_client.wait_for_finished_timeout( operation_handle.get_handle(), timeout) def wait_for_admission_control(self, operation_handle, timeout_s=60): self.log_handle(operation_handle, 'waiting for completion of the admission control') return self.__beeswax_client.wait_for_admission_control( operation_handle.get_handle(), timeout_s=timeout_s) def get_admission_result(self, operation_handle): self.log_handle(operation_handle, 'getting the admission result') return self.__beeswax_client.get_admission_result(operation_handle.get_handle()) def get_log(self, operation_handle): self.log_handle(operation_handle, 'getting log for operation') return self.__beeswax_client.get_log(operation_handle.get_handle().log_context) def fetch(self, sql_stmt, operation_handle, max_rows=-1, discard_results=False): self.log_handle(operation_handle, 'fetching {} rows'.format( 'all' if max_rows < 0 else max_rows)) return self.__beeswax_client.fetch_results( sql_stmt, operation_handle.get_handle(), max_rows, discard_results) def handle_id(self, operation_handle): query_id = operation_handle.get_handle().id return query_id if query_id else str(operation_handle) def log_handle(self, operation_handle, message): handle_id = self.handle_id(operation_handle) LOG.info(u"{0}: {1}".format(handle_id, message)) def get_query_id(self, operation_handle): return operation_handle.get_handle().id class ImpylaHS2Connection(ImpalaConnection): """Connection to Impala using the impyla client connecting to HS2 endpoint. impyla implements the standard Python dbabi: https://www.python.org/dev/peps/pep-0249/ plus Impala-specific extensions, e.g. for fetching runtime profiles. TODO: implement support for kerberos, SSL, etc. """ # ClientRequestState::TOperationState() __OPERATION_STATE_TO_EXEC_STATE = { 'INITIALIZED_STATE': INITIALIZED, 'PENDING_STATE': PENDING, 'RUNNING_STATE': RUNNING, 'FINISHED_STATE': FINISHED, 'ERROR_STATE': ERROR, # These are not official ExecState, but added to complete mapping. 'CANCELED_STATE': 'UNIMPLEMENTED_CANCELLED', 'CLOSED_STATE': 'UNIMPLEMENTED_CLOSED', 'UKNOWN_STATE': 'UNIMPLEMENTED_UNKNOWN' } def __init__(self, host_port, use_kerberos=False, is_hive=False, use_http_transport=False, http_path="", use_ssl=False, collect_profile_and_log=True, user=None): self.__host_port = host_port self.__use_http_transport = use_http_transport self.__http_path = http_path self.__use_ssl = use_ssl if use_kerberos: raise NotImplementedError("Kerberos support not yet implemented") # Impyla connection and cursor is initialised in connect(). We need to reuse the same # cursor for different operations (as opposed to creating a new cursor per operation) # so that the session is preserved. This means that we can only execute one operation # at a time per connection, which is a limitation also imposed by the Beeswax API. # However, for ease of async query testing, opening multiple cursors through single # ImpylaHS2Connection is allowed if executing query through execute_async() or # execute() with user parameter that is different than self.__user. Do note though # that they will not share the same session with self.__cursor. self.__impyla_conn = None self.__cursor = None # List of all cursors that created through execute_async. self.__async_cursors = list() # Query options to send along with each query. self.__query_options = {} self._is_hive = is_hive # Some Hive HS2 protocol, such as custom Calcite planner, may be able to collect # profile and log from Impala. self._collect_profile_and_log = collect_profile_and_log self.__user = user def get_test_protocol(self): if self.__http_path: return HS2_HTTP else: return HS2 def get_host_port(self): return self.__host_port def set_configuration_option(self, name, value, is_log_sql=True): # Only set the option if it's not already set to the same value. # value must be parsed to string. name = name.lower() value = str(value) if self.__query_options.get(name) != value: self.__query_options[name] = value if is_log_sql: self.log_client("\n\nset {0}={1};\n".format(name, value)) return True return False def clear_configuration(self): self.__query_options.clear() if hasattr(tests.common, "current_node") and not self._is_hive: self.set_configuration_option("client_identifier", tests.common.current_node) def __open_single_cursor(self, user=None): return self.__impyla_conn.cursor(user=user, convert_types=False, close_finished_queries=False) def __close_single_cursor(self, cursor): try: # Explicitly close the cursor so that it will close the session. cursor.close() except Exception: # The session may no longer be valid if the impalad was restarted during the test. pass def default_cursor(self): if self.__cursor is None: self.__cursor = self.__open_single_cursor(user=self.__user) return self.__cursor def connect(self): host, port = self.__host_port.split(":") conn_kwargs = {} if self._is_hive: conn_kwargs['auth_mechanism'] = 'PLAIN' try: self.__impyla_conn = impyla.connect( host=host, port=int(port), use_http_transport=self.__use_http_transport, http_path=self.__http_path, use_ssl=self.__use_ssl, **conn_kwargs) self.log_client("connected to {0} with impyla {1}".format( self.__host_port, self.get_test_protocol())) except Exception as e: self.log_client("failed connecting to {0} with impyla {1}".format( self.__host_port, self.get_test_protocol() )) raise e def close(self): self.log_client("closing 1 sync and {0} async {1} connections to: {2}".format( len(self.__async_cursors), self.get_test_protocol(), self.__host_port)) if self.__cursor is not None: self.__close_single_cursor(self.__cursor) for async_cursor in self.__async_cursors: self.__close_single_cursor(async_cursor) # Remove all async cursors. self.__async_cursors = list() try: self.__impyla_conn.close() except AttributeError as e: # When the HTTP endpoint restarts, Thrift HTTP will close the endpoint and calling # close() will result in an exception. if not (self.__use_http_transport and 'NoneType' in str(e)): raise def get_tables(self, database=None): """Trigger the GetTables() HS2 request on the given database (None means all dbs). Returns a list of (catalogName, dbName, tableName, tableType, tableComment). """ self.log_client("getting tables for database: {0}".format(database)) self.default_cursor().get_tables(database_name=database) return self.default_cursor().fetchall() def close_query(self, operation_handle, fetch_profile_after_close=False): self.log_handle(operation_handle, 'closing query for operation') # close_operation() will wipe out _last_operation. # Assign it to op_handle so that we can pull the profile after close_operation(). op_handle = operation_handle.get_handle()._last_operation operation_handle.get_handle().close_operation() if fetch_profile_after_close: assert self._collect_profile_and_log, ( "This connection is not configured to collect profile.") return op_handle.get_profile(TRuntimeProfileFormat.STRING) return None def __log_execute(self, cursor, user, sql_stmt): self.log_client( (u"executing against {0} at {1}. session: {2} main_cursor: {3} " u"user: {4}\n{5}").format( (self._is_hive and 'Hive' or 'Impala'), self.__host_port, self.__get_session_id(cursor), (cursor == self.default_cursor()), user, format_sql_for_logging(sql_stmt)) ) def execute(self, sql_stmt, user=None, fetch_profile_after_close=False, fetch_exec_summary=False, profile_format=TRuntimeProfileFormat.STRING): same_user = (user == self.__user) cursor = (self.default_cursor() if same_user # Must create a new cursor to supply 'user'. else self.__open_single_cursor(user=user)) result = None try: self.__log_execute(cursor, user, sql_stmt) cursor.execute(sql_stmt, configuration=self.__query_options) handle = OperationHandle(cursor, sql_stmt) self.log_handle(handle, "query started") result = self.__fetch_results_and_profile( handle, fetch_profile_after_close=fetch_profile_after_close, fetch_exec_summary=fetch_exec_summary, profile_format=profile_format) finally: cursor.close_operation() if not same_user: self.__close_single_cursor(cursor) return result def __fetch_results_and_profile( self, operation_handle, fetch_profile_after_close=False, fetch_exec_summary=False, profile_format=TRuntimeProfileFormat.STRING): r = None try: r = self.__fetch_results(operation_handle, fetch_exec_summary=fetch_exec_summary, profile_format=profile_format) finally: if r is None: # Try to close the query handle but ignore any exceptions not to replace the # original exception raised by '__fetch_results'. try: self.close_query(operation_handle) except Exception: pass elif fetch_profile_after_close: # Match ImpalaBeeswaxResult by placing the full profile including end time and # duration into the return object. r.runtime_profile = self.close_query(operation_handle, fetch_profile_after_close) return r else: self.close_query(operation_handle) return r def execute_async(self, sql_stmt, user=None): async_cursor = None try: async_cursor = self.__open_single_cursor(user=user) handle = OperationHandle(async_cursor, sql_stmt) self.__log_execute(async_cursor, user, sql_stmt) async_cursor.execute_async(sql_stmt, configuration=self.__query_options) self.__async_cursors.append(async_cursor) return handle except Exception as e: if async_cursor: async_cursor.close_operation() self.__close_single_cursor(async_cursor) raise e def cancel(self, operation_handle): self.log_handle(operation_handle, 'canceling operation') cursor = operation_handle.get_handle() return cursor.cancel_operation(reset_state=False) def get_query_id(self, operation_handle): """Return the string representation of the query id. Return empty string if handle is already canceled or closed.""" id = None last_op = operation_handle.get_handle()._last_operation if last_op is not None: id = op_handle_to_query_id(last_op.handle) return "" if id is None else id def __get_session_id(self, cursor): """Return the string representation of the session id. Return empty string if handle is already canceled or closed.""" id = None if cursor.session is not None: id = session_handle_to_session_id(cursor.session.handle) return "" if id is None else id def handle_id(self, operation_handle): query_id = self.get_query_id(operation_handle) return query_id if query_id else str(operation_handle) def get_state(self, operation_handle): self.log_handle(operation_handle, 'getting state') cursor = operation_handle.get_handle() # cursor.status contains a string representation of one of # TCLIService.TOperationState. return cursor.status() def get_impala_exec_state(self, operation_handle): try: return self.__OPERATION_STATE_TO_EXEC_STATE[self.get_state(operation_handle)] except impyla_error.Error: return ERROR except Exception as e: raise e def get_exec_summary(self, operation_handle): self.log_handle(operation_handle, 'getting exec summary operation') cursor = operation_handle.get_handle() # summary returned is thrift, not string. return cursor.get_summary() def get_runtime_profile(self, operation_handle, profile_format=TRuntimeProfileFormat.STRING): self.log_handle(operation_handle, 'getting runtime profile operation') cursor = operation_handle.get_handle() return cursor.get_profile(profile_format=profile_format) def wait_for_finished_timeout(self, operation_handle, timeout): self.log_handle(operation_handle, 'waiting for query to reach FINISHED state') start_time = time.time() while time.time() - start_time < timeout: start_rpc_time = time.time() impala_state = self.get_impala_exec_state(operation_handle) rpc_time = time.time() - start_rpc_time # if the rpc succeeded, the output is the query state if impala_state == FINISHED: return True elif impala_state == ERROR: try: error_log = operation_handle.get_handle().get_log() raise impyla_error.OperationalError(error_log, None) finally: self.close_query(operation_handle) if rpc_time < DEFAULT_SLEEP_INTERVAL: time.sleep(DEFAULT_SLEEP_INTERVAL - rpc_time) return False def wait_for_admission_control(self, operation_handle, timeout_s=60): self.log_handle(operation_handle, 'waiting for completion of the admission control') start_time = time.time() while time.time() - start_time < timeout_s: start_rpc_time = time.time() if self.is_admitted(operation_handle): return True rpc_time = time.time() - start_rpc_time if rpc_time < DEFAULT_SLEEP_INTERVAL: time.sleep(DEFAULT_SLEEP_INTERVAL - rpc_time) return False def get_admission_result(self, operation_handle): self.log_handle(operation_handle, 'getting the admission result') if self.is_admitted(operation_handle): query_profile = self.get_runtime_profile(operation_handle) admit_result = re.search(r"Admission result: (.*)", query_profile) if admit_result: return admit_result.group(1) return "" def get_log(self, operation_handle): self.log_handle(operation_handle, 'getting log for operation') # HS2 includes non-error log messages that we need to filter out. cursor = operation_handle.get_handle() lines = [line for line in cursor.get_log().split('\n') if not PROGRESS_LOG_RE.match(line)] return '\n'.join(lines) def fetch(self, sql_stmt, operation_handle, max_rows=-1, discard_results=False): self.log_handle(operation_handle, 'fetching {} rows'.format( 'all' if max_rows < 0 else max_rows)) return self.__fetch_results(operation_handle, max_rows, discard_results) def __fetch_results(self, handle, max_rows=-1, discard_results=False, fetch_exec_summary=False, profile_format=TRuntimeProfileFormat.STRING): """Implementation of result fetching from handle.""" cursor = handle.get_handle() assert cursor is not None # Don't fetch data for queries with no results. result_tuples = None column_labels = None column_types = None if cursor.has_result_set: desc = cursor.description column_labels = [col_desc[0].upper() for col_desc in desc] column_types = [col_desc[1].upper() for col_desc in desc] if max_rows < 0: result_tuples = cursor.fetchall() else: result_tuples = cursor.fetchmany(max_rows) result = None if discard_results: return result log = None profile = None exec_summary = None if not self._is_hive: if fetch_exec_summary: exec_summary = self.get_exec_summary_table(handle) if self._collect_profile_and_log: log = self.get_log(handle) profile = self.get_runtime_profile(handle, profile_format=profile_format) result = ImpylaHS2ResultSet(success=True, result_tuples=result_tuples, column_labels=column_labels, column_types=column_types, query=handle.sql_stmt(), log=log, profile=profile, query_id=self.get_query_id(handle), exec_summary=exec_summary) return result class ImpylaHS2ResultSet(object): """This emulates the interface of ImpalaBeeswaxResult so that it can be used in place of it. TODO: when we deprecate/remove Beeswax, clean this up.""" def __init__(self, success, result_tuples, column_labels, column_types, query, log, profile, query_id, exec_summary): self.success = success self.column_labels = column_labels self.column_types = column_types self.query = query self.log = log # ImpalaBeeswaxResult store profile at runtime_profile field self.runtime_profile = profile self.query_id = query_id self.__result_tuples = result_tuples # self.data is the data in the ImpalaBeeswaxResult format: a list of rows with each # row represented as a tab-separated string. self.data = None if result_tuples is not None: self.data = [self.__convert_result_row(tuple) for tuple in result_tuples] self.exec_summary = exec_summary def tuples(self): """Return the raw HS2 result set, which is a list of tuples.""" return self.__result_tuples def __convert_result_row(self, result_tuple): """Take primitive values from a result tuple and construct the tab-separated string that would have been returned via beeswax.""" return '\t'.join([self.__convert_result_value(val) for val in result_tuple]) def __convert_result_value(self, val): """Take a primitive value from a result tuple and its type and construct the string that would have been returned via beeswax.""" if val is None: return 'NULL' if type(val) == float: # Same format as what Beeswax uses in the backend. return "{:.16g}".format(val) else: return str(val) def create_connection(host_port, use_kerberos=False, protocol=BEESWAX, is_hive=False, use_ssl=False, collect_profile_and_log=True): if protocol == BEESWAX: c = BeeswaxConnection(host_port=host_port, use_kerberos=use_kerberos, use_ssl=use_ssl) elif protocol == HS2: c = ImpylaHS2Connection(host_port=host_port, use_kerberos=use_kerberos, is_hive=is_hive, use_ssl=use_ssl, collect_profile_and_log=collect_profile_and_log) else: assert protocol == HS2_HTTP c = ImpylaHS2Connection(host_port=host_port, use_kerberos=use_kerberos, is_hive=is_hive, use_http_transport=True, http_path='cliservice', use_ssl=use_ssl, collect_profile_and_log=collect_profile_and_log) # A hook in conftest sets tests.common.current_node. Skip for Hive connections since # Hive cannot modify client_identifier at runtime. if hasattr(tests.common, "current_node") and not is_hive: c.set_configuration_option("client_identifier", tests.common.current_node) return c def create_ldap_connection(host_port, user, password, use_ssl=False): return BeeswaxConnection(host_port=host_port, user=user, password=password, use_ssl=use_ssl) class MinimalHS2OperationHandle(OperationHandle): def __str__(self): return op_handle_to_query_id(self.get_handle()) class MinimalHS2Connection(ImpalaConnection): """ Connection to Impala using the HiveServer2 (HS2) protocol. This class does not use Impyla's DB-API cursors. Instead, it is built directly on the HS2 RPC layer to support manipulating one operation from multiple connections concurrently. This class is designed to be minimalistic to facilitate testing. Each method is mapped to only one Thrift RPC. """ def __init__(self, host_port, user=None): self.__host_port = host_port host, port = host_port.split(":") self.__conn = hs2.connect(host, port, auth_mechanism='NOSASL') self.__user = user if user is not None else getpass.getuser() self.__session = self.__conn.open_session(self.__user) def connect(self): pass # Do nothing def close(self): LOG.info("-- closing connection to: %s" % self.__host_port) try: self.__session.close() finally: self.__conn.close() def execute(self, sql_stmt, user=None, fetch_profile_after_close=False, # noqa: U100 fetch_exec_summary=False, # noqa: U100 profile_format=TRuntimeProfileFormat.STRING): # noqa: U100 raise NotImplementedError() def execute_async(self, sql_stmt): hs2_operation = self.__session.execute(sql_stmt) operation_handle = MinimalHS2OperationHandle(hs2_operation.handle, sql_stmt) LOG.info("Started query {0}".format(operation_handle)) return operation_handle def __get_operation(self, operation_handle): return hs2.Operation(self.__session, operation_handle.get_handle()) def fetch(self, sql_stmt, operation_handle, max_rows=-1): # noqa: U100 """ Fetch the results of the query. It will block the current connection if the results are not available yet. """ LOG.info("-- fetching results from: {0}".format(operation_handle)) return self.__get_operation(operation_handle).fetch(max_rows=max_rows) def fetch_error(self, operation_handle): """ Fetch the error of the query. """ try: self.fetch(None, operation_handle) assert False, "Failed to catch the error of the query." except Exception as exc: return exc def get_state(self, operation_handle): return self.__get_operation(operation_handle).get_status() def wait_for(self, operation_handle, timeout_s=60): """ Wait until the query is in a terminal state. """ start_time = time.time() while True: operation_state = self.get_state(operation_handle) if operation_state not in ("PENDING_STATE", "INITIALIZED_STATE", "RUNNING_STATE"): return operation_state if time.time() - start_time > timeout_s: raise Exception("Timed out waiting for the query") time.sleep(0.1) def cancel(self, operation_handle): LOG.info("-- canceling operation: {0}".format(operation_handle)) return self.__get_operation(operation_handle).cancel() def close_query(self, operation_handle): LOG.info("-- closing query for operation handle: {0}".format(operation_handle)) return self.__get_operation(operation_handle).close() def state_is_finished(self, operation_handle): # noqa: U100 raise NotImplementedError() def get_log(self, operation_handle): return self.__get_operation(operation_handle).get_log() def set_configuration_option(self, name, value): # noqa: U100 raise NotImplementedError() def clear_configuration(self): raise NotImplementedError() def get_host_port(self): return self.__host_port def get_test_protocol(self): return HS2 def handle_id(self, operation_handle): # noqa: U100 return str(operation_handle) def get_admission_result(self, operation_handle): # noqa: U100 raise NotImplementedError() def get_impala_exec_state(self, operation_handle): # noqa: U100 raise NotImplementedError() def get_runtime_profile(self, operation_handle, # noqa: U100 profile_format=TRuntimeProfileFormat.STRING): # noqa: U100 raise NotImplementedError() def wait_for_admission_control(self, operation_handle, timeout_s=60): # noqa: U100 raise NotImplementedError() def get_exec_summary(self, operation_handle): # noqa: U100 raise NotImplementedError()