#!/usr/bin/env python # Copyright 2012 Cloudera Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Impala's shell import cmd import getpass import prettytable import os import sasl import shlex import signal import socket import sqlparse import sys import threading import time import re from optparse import OptionParser from Queue import Queue, Empty from shell_output import OutputStream, DelimitedOutputFormatter, PrettyOutputFormatter from subprocess import call from beeswaxd import BeeswaxService from beeswaxd.BeeswaxService import QueryState from ImpalaService import ImpalaService from ImpalaService.ImpalaService import TImpalaQueryOptions, TResetTableReq from ExecStats.ttypes import TExecStats from ImpalaService.ImpalaService import TPingImpalaServiceResp from Status.ttypes import TStatus, TStatusCode from thrift_sasl import TSaslClientTransport from thrift.transport.TSocket import TSocket from thrift.transport.TTransport import TBufferedTransport, TTransportException from thrift.protocol import TBinaryProtocol from thrift.Thrift import TApplicationException VERSION_FORMAT = "Impala Shell v%(version)s (%(git_hash)s) built on %(build_date)s" VERSION_STRING = "build version not available" HISTORY_LENGTH = 100 # Tarball / packaging build makes impala_build_version available try: from impala_build_version import get_git_hash, get_build_date, get_version VERSION_STRING = VERSION_FORMAT % {'version': get_version(), 'git_hash': get_git_hash()[:7], 'build_date': get_build_date()} except Exception: pass class ImpalaPrettyTable(prettytable.PrettyTable): """Patched version of PrettyTable that TODO""" def _unicode(self, value): if not isinstance(value, basestring): value = str(value) if not isinstance(value, unicode): # If a value cannot be encoded, replace it with a placeholder. value = unicode(value, self.encoding, "replace") return value class RpcStatus: """Convenience enum to describe Rpc return statuses""" OK = 0 ERROR = 1 class RpcResult(object): """Wrapper for Rpc results. An Rpc results consists of the status and the result of the rpc. If a queue object is passed to the ctor, get_results blocks until there's a result in the queue. If the rpc result and status are passed to the ctor, it acts as a wrapper and returns them. """ def __init__(self, queue=None, result=None, status=None): self.result_queue = queue self.result = result self.status = status def get_results(self): if self.result_queue: # Block until the results are available. # queue.get() without a timeout is not interruptable with KeyboardInterrupt. # Set the timeout to a day (a reasonable limit for a single rpc call) self.result, self.status = self.result_queue.get(True, 60 * 24 * 24) return self.result, self.status class ImpalaShell(cmd.Cmd): """ Simple Impala Shell. Basic usage: type connect to connect to an impalad Then issue queries or other commands. Tab-completion should show the set of available commands. Methods that implement shell commands return a boolean tuple (stop, status) stop is a flag the command loop uses to continue/discontinue the prompt. Status tells the caller that the command completed successfully. """ DISCONNECTED_PROMPT = "[Not connected] > " # If not connected to an impalad, the server version is unknown. UNKNOWN_SERVER_VERSION = "Not Connected" # Commands are terminated with the following delimiter. CMD_DELIM = ';' DEFAULT_DB = 'default' # Regex applied to all tokens of a query to detect the query type. INSERT_REGEX = re.compile("^insert$", re.I) def __init__(self, options): cmd.Cmd.__init__(self) self.user = options.user self.is_alive = True self.use_kerberos = options.use_kerberos self.verbose = options.verbose self.kerberos_service_name = options.kerberos_service_name self.impalad = None self.prompt = ImpalaShell.DISCONNECTED_PROMPT self.connected = False self.imp_service = None self.transport = None self.fetch_batch_size = 1024 self.default_query_options = {} self.set_query_options = {} self.query_state = QueryState._NAMES_TO_VALUES self.refresh_after_connect = options.refresh_after_connect self.current_db = options.default_db self.history_file = os.path.expanduser("~/.impalahistory") self.server_version = ImpalaShell.UNKNOWN_SERVER_VERSION self.show_profiles = options.show_profiles self.ssl_enabled = options.ssl self.ca_cert = options.ca_cert self.use_ldap = options.use_ldap # Stores the state of user input until a delimiter is seen. self.partial_cmd = str() # Stores the old prompt while the user input is incomplete. self.cached_prompt = str() # Tracks query handle of the last query executed. Used by the 'profile' command. self.last_query_handle = None self.query_handle_closed = False # Indicates whether the rpc being run is interruptable. If True, the signal handler # reconnects and closes the query handle. self.rpc_is_interruptable = False self.output_file = options.output_file # Output formatting flags/options self.output_delimiter = options.output_delimiter self.write_delimited = options.write_delimited self.print_header = options.print_header if options.strict_unicode: self.utf8_encode_policy = 'strict' else: self.utf8_encode_policy = 'ignore' self.__populate_command_list() try: self.readline = __import__('readline') self.readline.set_history_length(HISTORY_LENGTH) except ImportError: self.__disable_readline() if self.use_ldap: self.ldap_password = getpass.getpass("LDAP password for %s:" % self.user) if options.impalad != None: self.do_connect(options.impalad) # We handle Ctrl-C ourselves, using an Event object to signal cancellation # requests between the handler and the main shell thread. Ctrl-C is explicitly # not intercepted during an rpc, as it may interrupt system calls leaving # the underlying socket unusable. self.is_interrupted = threading.Event() signal.signal(signal.SIGINT, self.__signal_handler) def __populate_command_list(self): """Populate a list of commands in the shell. Each command has its own method of the form do_, and can be extracted by introspecting the class directory. """ # Slice the command method name to get the name of the command. self.commands = [cmd[3:] for cmd in dir(self.__class__) if cmd.startswith('do_')] def __disable_readline(self): """Disables the readline module. The readline module is responsible for keeping track of command history. """ self.readline = None def __print_options(self, default_options, set_options): # Prints the current query options # with default values distinguished from set values by brackets [] if not default_options and not set_options: print '\tNo options available.' else: for k in sorted(default_options.keys()): if k in set_options.keys() and set_options[k] != default_options[k]: print '\n'.join(["\t%s: %s" % (k, set_options[k])]) else: print '\n'.join(["\t%s: [%s]" % (k, default_options[k])]) def __options_to_string_list(self): return ["%s=%s" % (k, v) for (k, v) in self.set_query_options.iteritems()] def __build_default_query_options_dict(self): # The default query options are retrieved from a rpc call, and are dependent # on the impalad to which a connection has been established. They need to be # refreshed each time a connection is made. This is particularly helpful when # there is a version mismatch between the shell and the impalad. get_default_query_options = self.imp_service.get_default_configuration(False) rpc_result = self.__do_rpc(lambda: get_default_query_options) options, status = rpc_result.get_results() if status != RpcStatus.OK: print_to_stderr('Unable to retrieve default query options') for option in options: self.default_query_options[option.key.upper()] = option.value def do_shell(self, args): """Run a command on the shell Usage: shell ! """ try: os.system(args) except Exception, e: print_to_stderr('Error running command : %s' % e) return True def sanitise_input(self, args, interactive=True): """Convert the command to lower case, so it's recognized""" # A command terminated by a semi-colon is legal. Check for the trailing # semi-colons and strip them from the end of the command. args = args.strip() tokens = args.split(' ') if not interactive: tokens[0] = tokens[0].lower() # Strip all the non-interactive commands of the delimiter. return ' '.join(tokens).rstrip(ImpalaShell.CMD_DELIM) # The first token is converted into lower case to route it to the # appropriate command handler. This only applies to the first line of user input. # Modifying tokens in subsequent lines may change the semantics of the command, # so do not modify the text. if not self.partial_cmd: # The first token is the command. # If it's EOF, call do_quit() if tokens[0] == 'EOF': return 'quit' else: tokens[0] = tokens[0].lower() elif tokens[0] == "EOF": # If a command is in progress and the user hits a Ctrl-D, clear its state # and reset the prompt. self.prompt = self.cached_prompt self.partial_cmd = str() # The print statement makes the new prompt appear in a new line. # Also print an extra newline to indicate that the current command has # been cancelled. print '\n' return str() args = self.__check_for_command_completion(' '.join(tokens).strip()) return args.rstrip(ImpalaShell.CMD_DELIM) def __cmd_ends_with_delim(self, line): """Check if the input command ends with a command delimiter. A command ending with the delimiter and containing an open quotation character is not considered terminated. If no open quotation is found, it's considered terminated. """ if line.endswith(ImpalaShell.CMD_DELIM): try: # Look for an open quotation in the entire command, and not just the # current line. if self.partial_cmd: line = '%s %s' % (self.partial_cmd, line) shlex.split(line) return True # If the command ends with a delimiter, check if it has an open quotation. # shlex.split() throws a ValueError iff an open quoation is found. # A quotation can either be a single quote or a double quote. except ValueError: pass return False def __check_for_command_completion(self, cmd): """Check for a delimiter at the end of user input. The end of the user input is scanned for a legal delimiter. If a delimiter is not found: - Input is not send to onecmd() - onecmd() is a method in Cmd which routes the user input to the appropriate method. An empty string results in a no-op. - Input is removed from history. - Input is appended to partial_cmd If a delimiter is found: - The contents of partial_cmd are put in history, as they represent a completed command. - The contents are passed to the appropriate method for execution. - partial_cmd is reset to an empty string. """ if self.readline: current_history_len = self.readline.get_current_history_length() # Input is incomplete, store the contents and do nothing. if not self.__cmd_ends_with_delim(cmd): # The user input is incomplete, change the prompt to reflect this. if not self.partial_cmd and cmd: self.cached_prompt = self.prompt self.prompt = '> '.rjust(len(self.cached_prompt)) # partial_cmd is already populated, add the current input after a newline. if self.partial_cmd and cmd: self.partial_cmd = "%s\n%s" % (self.partial_cmd, cmd) else: # If the input string is empty or partial_cmd is empty. self.partial_cmd = "%s%s" % (self.partial_cmd, cmd) # Remove the most recent item from history if: # -- The current state of user input in incomplete. # -- The most recent user input is not an empty string if self.readline and current_history_len > 0 and cmd: self.readline.remove_history_item(current_history_len - 1) # An empty string results in a no-op. Look at emptyline() return str() elif self.partial_cmd: # input ends with a delimiter and partial_cmd is not empty if cmd != ImpalaShell.CMD_DELIM: completed_cmd = "%s\n%s" % (self.partial_cmd, cmd) else: completed_cmd = "%s%s" % (self.partial_cmd, cmd) # Reset partial_cmd to an empty string self.partial_cmd = str() # Replace the most recent history item with the completed command. completed_cmd = sqlparse.format(completed_cmd, strip_comments=True) if self.readline and current_history_len > 0: # Update the history item to replace newlines with spaces. This is needed so # readline can properly restore the history (otherwise it interprets each newline # as a separate history item). self.readline.replace_history_item(current_history_len - 1, completed_cmd.replace('\n', ' ')) # Revert the prompt to its earlier state self.prompt = self.cached_prompt else: # Input has a delimiter and partial_cmd is empty completed_cmd = sqlparse.format(cmd, strip_comments=True) # The comments have been parsed out, there is no need to retain the newlines. # They can cause parse errors in sqlparse when unescaped quotes and delimiters # come into play. return completed_cmd.replace('\n', ' ') def __signal_handler(self, signal, frame): self.is_interrupted.set() if not self.rpc_is_interruptable: return # If the is_interruptable event object is set, the rpc may be still in progress. # Create a new connection to the impalad and cancel the query. try: self.__connect() print_to_stderr('Closing Query handle.') self.__close_query() if self.current_db: self.cmdqueue.append('use %s' % self.current_db + ImpalaShell.CMD_DELIM) except Exception, e: print_to_stderr("Failed to reconnect and close: %s" % str(e)) def precmd(self, args): self.is_interrupted.clear() self.rpc_is_interruptable = False args = self.sanitise_input(args) if not args: return args # Split args using sqlparse. If there are multiple queries present in user input, # the length of the returned query list will be greater than one. parsed_cmds = sqlparse.split(args) if len(parsed_cmds) > 1: # The last command needs a delimiter to be successfully executed. parsed_cmds[-1] += ImpalaShell.CMD_DELIM self.cmdqueue.extend(parsed_cmds) # If cmdqueue is populated, then commands are executed from the cmdqueue, and user # input is ignored. Send an empty string as the user input just to be safe. return str() return args.encode('utf-8', self.utf8_encode_policy) def postcmd(self, status, args): """Hack to make non interactive mode work""" self.is_interrupted.clear() # cmd expects return of False to keep going, and True to quit. # Shell commands return True on success, False on error, and None to quit, so # translate between them. # TODO : Remove in the future once shell and Impala query processing can be separated. if status == None: return True else: return False def __build_summary_table(self, summary, idx, is_fragment_root, indent_level, output): """Direct translation of Coordinator::PrintExecSummary() to recursively build a list of rows of summary statistics, one per exec node summary: the TExecSummary object that contains all the summary data idx: the index of the node to print is_fragment_root: true if the node to print is the root of a fragment (and therefore feeds into an exchange) indent_level: the number of spaces to print before writing the node's label, to give the appearance of a tree. The 0th child of a node has the same indent_level as its parent. All other children have an indent_level of one greater than their parent. output: the list of rows into which to append the rows produced for this node and its children. Returns the index of the next exec node in summary.exec_nodes that should be processed, used internally to this method only. """ attrs = ["latency_ns", "cpu_time_ns", "cardinality", "memory_used"] # Initialise aggregate and maximum stats agg_stats, max_stats = TExecStats(), TExecStats() for attr in attrs: setattr(agg_stats, attr, 0) setattr(max_stats, attr, 0) node = summary.nodes[idx] for stats in node.exec_stats: for attr in attrs: val = getattr(stats, attr) if val is not None: setattr(agg_stats, attr, getattr(agg_stats, attr) + val) setattr(max_stats, attr, max(getattr(max_stats, attr), val)) if len(node.exec_stats) > 0: avg_time = agg_stats.latency_ns / len(node.exec_stats) else: avg_time = 0 # If the node is a broadcast-receiving exchange node, the cardinality of rows produced # is the max over all instances (which should all have received the same number of # rows). Otherwise, the cardinality is the sum over all instances which process # disjoint partitions. if node.is_broadcast and is_fragment_root: cardinality = max_stats.cardinality else: cardinality = agg_stats.cardinality est_stats = node.estimated_stats label_prefix = "" if indent_level > 0: label_prefix = "|" if is_fragment_root: label_prefix += " " * indent_level else: label_prefix += "--" * indent_level def prettyprint(val, units, divisor): for unit in units: if val < divisor: if unit == units[0]: return "%d%s" % (val, unit) else: return "%3.2f%s" % (val, unit) val /= divisor def prettyprint_bytes(byte_val): return prettyprint(byte_val, [' B', ' KB', ' MB', ' GB', ' TB'], 1024.0) def prettyprint_units(unit_val): return prettyprint(unit_val, ["", "K", "M", "B"], 1000.0) def prettyprint_time(time_val): return prettyprint(time_val, ["ns", "us", "ms", "s"], 1000.0) row = [ label_prefix + node.label, len(node.exec_stats), prettyprint_time(avg_time), prettyprint_time(max_stats.latency_ns), prettyprint_units(cardinality), prettyprint_units(est_stats.cardinality), prettyprint_bytes(max_stats.memory_used), prettyprint_bytes(est_stats.memory_used), node.label_detail ] output.append(row) try: sender_idx = summary.exch_to_sender_map[idx] # This is an exchange node, so the sender is a fragment root, and should be printed # next. self.__build_summary_table(summary, sender_idx, True, indent_level, output) except (KeyError, TypeError): # Fall through if idx not in map, or if exch_to_sender_map itself is not set pass idx += 1 if node.num_children > 0: first_child_output = [] idx = \ self.__build_summary_table(summary, idx, False, indent_level, first_child_output) for child_idx in xrange(1, node.num_children): # All other children are indented (we only have 0, 1 or 2 children for every exec # node at the moment) idx = self.__build_summary_table(summary, idx, False, indent_level + 1, output) output += first_child_output return idx def do_summary(self, args): if not self.connected: print_to_stderr("Must be connected to an Impala demon to retrieve query summaries") return True summary = self.__get_summary() if summary is None: print_to_stderr("Could not retrieve summary for query.") return True if summary.nodes is None: print_to_stderr("Summary not available") return True output = [] table = self.__construct_table_header(["Operator", "#Hosts", "Avg Time", "Max Time", "#Rows", "Est. #Rows", "Peak Mem", "Est. Peak Mem", "Detail"]) self.__build_summary_table(summary, 0, False, 0, output) formatter = PrettyOutputFormatter(table) self.output_stream = OutputStream(formatter, filename=self.output_file) self.output_stream.write(output) return True def do_set(self, args): """Set or display query options. Display query options: Usage: SET Set query options: Usage: SET