mirror of
https://github.com/apache/impala.git
synced 2026-01-22 09:01:58 -05:00
HS2 is added as an option via --protocol=hs2. The user-visible differences in behaviour are minimal. Beeswax is still the default and can be explicitly enabled via --protocol=beeswax but will be deprecated. The default is unchanged because changing the default could break certain workflows, e.g. those that explicitly specify the port with -i or deployments that hit --fe_service_threads for HS2 and somehow rely on impala-shell not contributing to that limit. For most workflows the change is transparent and we should change the default in a major version change. This support requires Impala-specific extensions to the HS2 interface, similar to the existing extensions to Beeswax. Thus the HS2 shell is only forwards-compatible with newer Impala versions. I considered trying to gracefully degrade when the new extensions weren't present, but it didn't seem to be worth the ongoing testing effort. Differences between HS2 and Beeswax are abstracted into ImpalaClient subclasses. Here are the changes required to make it work: * Switch to TBinaryProtocolAccelerated to avoid perf regression. The HS2 protocol requires decoding more primitive values (because it's not a string-per-row), which was slow with the pure Python implementation of TBinaryProtocol. * Added bitarray module to efficiently unpack null indicators. * Minimise invasiveness of changes by transposing and stringifying the columnar results into rows in impala_client.py. The transposition needs to happen before display anyway. * Add PingImpalaHS2Service() to get back version string and webserver address. * Add CloseImpalaOperation() extension to return DML row counts. This possibly addresses IMPALA-1789, although we need to confirm that this is a sufficient solution. * Add is_closed member to query handles to avoid shell independently tracking whether the query handle was closed or not. * Include query status in HS2 log to match beeswax. 
* HS2 GetLog() command now includes query status error message for consistency with beeswax. * "set"/"set all" uses the client requests options, not the session default. This captures the effective value of TIMEZONE, which was previously missing. This also requires test changes where the tests set non-default values, e.g. for ABORT_ON_ERROR. * "set all" on the server side returns REMOVED query options - the shell needs to know these so it can correctly ignore them. * Clean up self.orig_cmd/self.last_leading comment argument passing to avoid implicit parameter passing through multiple function calls. * Clean up argument handling in shell tests to consistently pass around lists of arguments instead of strings that are subject to shell tokenisation rules. * Consistently close connections in the shell to avoid leaking HS2 sessions. This is enforced by making ImpalaShell a context manager and also eliminating all sys.exit() calls that would bypass the explicit connection closing. Testing: * Shell tests can run with both protocols * Add tests for formatting of all types and NULL values * Added testing for floating point output formatting, which does change as a result of switching to server-side vs client-side formatting. * Verified that newly-added tests were actually going through HS2 by disabling hs2 on the minicluster and running tests. * Add checks to test_verify_metrics.py to ensure that no sessions are left open at the end of tests. 
Performance: Baseline from beeswax shell for large extract is as follows: $ time impala-shell.sh -B -q 'select * from tpch_parquet.orders' > /dev/null real 0m6.708s user 0m5.132s sys 0m0.204s After this change it is somewhat slower, but we generally don't consider bulk extract performance through the shell to be perf-critical: real 0m7.625s user 0m6.436s sys 0m0.256s Change-Id: I6d5cc83d545aacc659523f29b1d6feed672e2a12 Reviewed-on: http://gerrit.cloudera.org:8080/12884 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
117 lines
3.8 KiB
Python
117 lines
3.8 KiB
Python
"""
|
|
This package defines an object type which can efficiently represent
|
|
a bitarray. Bitarrays are sequence types and behave very much like lists.
|
|
|
|
Please find a description of this package at:
|
|
|
|
http://pypi.python.org/pypi/bitarray/
|
|
|
|
Author: Ilan Schnell
|
|
"""
|
|
from bitarray._bitarray import _bitarray, bitdiff, bits2bytes, _sysinfo
|
|
|
|
# Version string of the bitarray package.
__version__ = '0.9.0'
|
|
|
|
|
|
def _check_codedict(codedict):
|
|
if not isinstance(codedict, dict):
|
|
raise TypeError("dictionary expected")
|
|
if len(codedict) == 0:
|
|
raise ValueError("prefix code empty")
|
|
for k, v in codedict.items():
|
|
if not isinstance(v, bitarray):
|
|
raise TypeError("bitarray expected for dictionary value")
|
|
if v.length() == 0:
|
|
raise ValueError("non-empty bitarray expected")
|
|
|
|
|
|
class bitarray(_bitarray):
    """bitarray([initial], [endian=string])

    Return a new bitarray object whose items are bits initialized from
    the optional initial, and endianness.
    If no object is provided, the bitarray is initialized to have length zero.
    The initial object may be of the following types:

    int, long
        Create bitarray of length given by the integer.  The initial values
        in the array are random, because only the memory is allocated.

    string
        Create bitarray from a string of '0's and '1's.

    list, tuple, iterable
        Create bitarray from a sequence, each element in the sequence is
        converted to a bit using its truth value.

    bitarray
        Create bitarray from another bitarray.  This is done by copying the
        memory holding the bitarray data, and is hence very fast.

    The optional keyword argument 'endian' specifies the bit endianness of the
    created bitarray object.
    Allowed values are 'big' and 'little' (default is 'big').

    Note that setting the bit endianness only has an effect when accessing the
    machine representation of the bitarray, i.e. when using the methods: tofile,
    fromfile, tobytes, frombytes."""

    def fromstring(self, string):
        """fromstring(string)

        Append from a string, interpreting the string as machine values.
        A ``bytes`` object is appended as-is; any other string is first
        converted with its ``encode()`` method (default codec).
        Deprecated since version 0.4.0, use ``frombytes()`` instead."""
        # bytes has no encode() method on Python 3, and on Python 2 (where
        # str is bytes) encode() only succeeds for pure-ASCII data, which it
        # returns unchanged -- so raw bytes are handed over directly.
        if isinstance(string, bytes):
            return self.frombytes(string)
        return self.frombytes(string.encode())

    def tostring(self):
        """tostring() -> string

        Return the string representing (machine values) of the bitarray.
        When the length of the bitarray is not a multiple of 8, the few remaining
        bits (1..7) are set to 0.
        The result of ``tobytes()`` is decoded with the default codec, so
        contents that are not valid in that codec raise UnicodeDecodeError.
        Deprecated since version 0.4.0, use ``tobytes()`` instead."""
        return self.tobytes().decode()

    def decode(self, codedict):
        """decode(code) -> list

        Given a prefix code (a dict mapping symbols to bitarrays),
        decode the content of the bitarray and return the list of symbols.
        The prefix code is validated first; an invalid one raises TypeError
        or ValueError."""
        _check_codedict(codedict)
        return self._decode(codedict)

    def iterdecode(self, codedict):
        """iterdecode(code) -> iterator

        Given a prefix code (a dict mapping symbols to bitarrays),
        decode the content of the bitarray and iterate over the symbols.
        The prefix code is validated first; an invalid one raises TypeError
        or ValueError."""
        _check_codedict(codedict)
        return self._iterdecode(codedict)

    def encode(self, codedict, iterable):
        """encode(code, iterable)

        Given a prefix code (a dict mapping symbols to bitarrays),
        iterate over the iterable object with symbols, and extend the bitarray
        with the corresponding bitarray for each symbol.
        The prefix code is validated first; an invalid one raises TypeError
        or ValueError."""
        _check_codedict(codedict)
        self._encode(codedict, iterable)

    # Numeric conversions are explicitly rejected with a TypeError.

    def __int__(self):
        raise TypeError("int() argument cannot be a bitarray")

    def __long__(self):
        raise TypeError("long() argument cannot be a bitarray")

    def __float__(self):
        raise TypeError("float() argument cannot be a bitarray")
|
|
|
|
|
|
def test(verbosity=1, repeat=1):
    """test(verbosity=1, repeat=1) -> TextTestResult

    Execute the package's self-test and hand back the resulting
    unittest.runner.TextTestResult object.  Both arguments are forwarded
    unchanged, as keywords, to ``test_bitarray.run``.
    """
    # Imported inside the function, so the test module is only loaded when
    # test() is actually called.
    from bitarray import test_bitarray as suite

    return suite.run(verbosity=verbosity, repeat=repeat)
|