#!/usr/bin/env python # Copyright 2012 Cloudera Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Impala's shell import cmd import csv import prettytable import time import sys import os import signal import socket import threading from optparse import OptionParser import getpass from beeswaxd import BeeswaxService from beeswaxd.BeeswaxService import QueryState from ImpalaService import ImpalaService from ImpalaService.ImpalaService import TImpalaQueryOptions from Status.ttypes import TStatus, TStatusCode from thrift.transport.TSocket import TSocket from thrift.transport.TTransport import TBufferedTransport, TTransportException from thrift.protocol import TBinaryProtocol from thrift.Thrift import TApplicationException VERSION_FORMAT = "Impala v%(version)s (%(git_hash)s) built on %(build_date)s" COMMENT_TOKEN = '--' VERSION_STRING = "build version not available" HISTORY_LENGTH = 100 # Tarball / packaging build makes impala_build_version available try: from impala_build_version import get_git_hash, get_build_date, get_version VERSION_STRING = VERSION_FORMAT % {'version': get_version(), 'git_hash': get_git_hash()[:7], 'build_date': get_build_date()} except Exception: pass class RpcStatus: """Convenience enum to describe Rpc return statuses""" OK = 0 ERROR = 1 class OutputWriter(object): """Helper class for saving result set output to a file""" def __init__(self, file_name, field_delim): # The default csv field size limit is too small to write large result sets. Set it to # an artibrarily large value. csv.field_size_limit(sys.maxint) self.file_name = file_name if not field_delim: raise ValueError, 'A field delimiter is required to output results to a file' self.field_delim = field_delim.decode('string-escape') if len(self.field_delim) != 1: raise ValueError, 'Field delimiter must be a 1-character string' def write_rows(self, rows, mode='ab'): output_file = None try: output_file = open(self.file_name, mode) writer =\ csv.writer(output_file, delimiter=self.field_delim, quoting=csv.QUOTE_MINIMAL) writer.writerows(rows) finally: if output_file: output_file.close() # Simple Impala shell. Can issue queries (with configurable options) # Basic usage: type connect to connect to an impalad # Then issue queries or other commands. Tab-completion should show the set of # available commands. # Methods that implement shell commands return a boolean tuple (stop, status) # stop is a flag the command loop uses to continue/discontinue the prompt. # Status tells the caller that the command completed successfully. # TODO: (amongst others) # - Column headers / metadata support # - Report profiles # - A lot of rpcs return a verbose TStatus from thrift/Status.thrift # This will be useful for better error handling. The next iteration # of the shell should handle this return paramter. class ImpalaShell(cmd.Cmd): DISCONNECTED_PROMPT = "[Not connected] > " # Commands are terminated with the following delimiter. CMD_DELIM = ';' def __init__(self, options): cmd.Cmd.__init__(self) self.user = getpass.getuser() self.is_alive = True self.use_kerberos = options.use_kerberos self.verbose = options.verbose self.kerberos_service_name = options.kerberos_service_name self.impalad = None self.prompt = ImpalaShell.DISCONNECTED_PROMPT self.connected = False self.imp_service = None self.transport = None self.fetch_batch_size = 1024 self.default_query_options = {} self.set_query_options = {} self.query_state = QueryState._NAMES_TO_VALUES self.refresh_after_connect = options.refresh_after_connect self.default_db = options.default_db self.history_file = os.path.expanduser("~/.impalahistory") self.show_profiles = options.show_profiles # Stores the state of user input until a delimiter is seen. self.partial_cmd = str() # Stores the old prompt while the user input is incomplete. self.cached_prompt = str() # Tracks query handle of the last query executed. Used by the 'profile' command. self.last_query_handle = None self.output_writer = None if options.output_file: self.output_writer =\ OutputWriter(options.output_file, options.output_file_field_delim) try: self.readline = __import__('readline') self.readline.set_history_length(HISTORY_LENGTH) except ImportError: self.readline = None if options.impalad != None: self.do_connect(options.impalad) # We handle Ctrl-C ourselves, using an Event object to signal cancellation # requests between the handler and the main shell thread self.is_interrupted = threading.Event() signal.signal(signal.SIGINT, self.__signal_handler) def __print_options(self, options): if not options: print '\tNo options available.' else: print '\n'.join(["\t%s: %s" % (k,v) for (k,v) in options.iteritems()]) def __options_to_string_list(self): return ["%s=%s" % (k,v) for (k,v) in self.set_query_options.iteritems()] def __build_default_query_options_dict(self): # The default query options are retrieved from a rpc call, and are dependent # on the impalad to which a connection has been established. They need to be # refreshed each time a connection is made. This is particularly helpful when # there is a version mismatch between the shell and the impalad. get_default_query_options = self.imp_service.get_default_configuration(False) options, status = self.__do_rpc(lambda: get_default_query_options) if status != RpcStatus.OK: print 'Unable to retrive default query options' for option in options: self.default_query_options[option.key.upper()] = option.value def do_shell(self, args): """Run a command on the shell Usage: shell ! """ try: os.system(args) except Exception, e: print 'Error running command : %s' % e return True def sanitise_input(self, args, interactive=True): """Convert the command to lower case, so it's recognized""" # A command terminated by a semi-colon is legal. Check for the trailing # semi-colons and strip them from the end of the command. args = args.strip() tokens = args.split(' ') # The first token should be the command # If it's EOF, call do_quit() if tokens[0] == 'EOF': return 'quit' else: tokens[0] = tokens[0].lower() if interactive: args = self.__check_for_command_completion(' '.join(tokens).strip()) # We escape \n in multiline commands to enable history to be read properly. # As such, some commands will have \n escaped, this takes care of un-escaping them. args = args.rstrip(ImpalaShell.CMD_DELIM).decode('string-escape') else: args = ' '.join(tokens) return args def __check_for_command_completion(self, cmd): """Check for a delimiter at the end of user input. The end of the user input is scanned for a legal delimiter. If a delimiter is not found: - Input is not send to onecmd() - onecmd() is a method in Cmd which routes the user input to the appropriate method. An empty string results in a no-op. - Input is removed from history. - Input is appended to partial_cmd If a delimiter is found: - The contents of partial_cmd are put in history, as they represent a completed command. - The contents are passed to the appropriate method for execution. - partial_cmd is reset to an empty string. """ if self.readline: current_history_len = self.readline.get_current_history_length() # Input is incomplete, store the contents and do nothing. if not cmd.endswith(ImpalaShell.CMD_DELIM): # The user input is incomplete, change the prompt to reflect this. if not self.partial_cmd and cmd: self.cached_prompt = self.prompt self.prompt = '> '.rjust(len(self.cached_prompt)) # partial_cmd is already populated, add the current input after a newline. if self.partial_cmd and cmd: self.partial_cmd = "%s\n%s" % (self.partial_cmd, cmd) else: # If the input string is empty or partial_cmd is empty. self.partial_cmd = "%s%s" % (self.partial_cmd, cmd) # Remove the most recent item from history if: # -- The current state of user input in incomplete. # -- The most recent user input is not an empty string if self.readline and current_history_len > 0 and cmd: self.readline.remove_history_item(current_history_len - 1) # An empty string results in a no-op. Look at emptyline() return str() elif self.partial_cmd: # input ends with a delimiter and partial_cmd is not empty if cmd != ImpalaShell.CMD_DELIM: completed_cmd = "%s\n%s" % (self.partial_cmd, cmd) else: completed_cmd = "%s%s" % (self.partial_cmd, cmd) # Reset partial_cmd to an empty string self.partial_cmd = str() # Replace the most recent history item with the completed command. # In order for it to be read from the history file, the \n has # to be escaped. if self.readline and current_history_len > 0: self.readline.replace_history_item(current_history_len - 1, completed_cmd.encode('string-escape')) # Revert the prompt to its earlier state self.prompt = self.cached_prompt else: # Input has a delimiter and partial_cmd is empty completed_cmd = cmd return completed_cmd def __signal_handler(self, signal, frame): self.is_interrupted.set() def precmd(self, args): # TODO: Add support for multiple commands on the same line. self.is_interrupted.clear() return self.sanitise_input(args) def postcmd(self, status, args): """Hack to make non interactive mode work""" self.is_interrupted.clear() # cmd expects return of False to keep going, and True to quit. # Shell commands return True on success, False on error, and None to quit, so # translate between them. # TODO : Remove in the future once shell and Impala query processing can be separated. if status == None: return True else: return False def do_set(self, args): """Set or display query options. Display query options: Usage: SET Set query options: Usage: SET