Files
impala/tests/comparison/types.py
casey c413c03517 Misc updates to the query generator (part 2 of 2)
Summary of changes:

  1) (from Taras) Exercise CTAS and views by creating one from a random
     query, then SELECT * FROM table/view.

  2) Use bulk loading to generate random data. The old method was to use
     INSERTs which is very slow. Now local data files are generated and
     uploaded.

  3) Misc schema parsing changes needed to support the simplified type
     system in the earlier review (part 1).

Change-Id: I7986b97aa12051dc043faafef34a9540117e889f
Reviewed-on: http://gerrit.sjc.cloudera.com:8080/5646
Reviewed-by: Casey Ching <casey@cloudera.com>
Tested-by: Casey Ching <casey@cloudera.com>
Reviewed-by: Ishaan Joshi <ishaan@cloudera.com>
Tested-by: Ishaan Joshi <ishaan@cloudera.com>
2014-12-19 16:37:46 -08:00

226 lines
4.8 KiB
Python

# Copyright (c) 2014 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from tests.comparison.common import ValExpr, ValExprList
class DataTypeMetaclass(type):
  '''Metaclass for data type classes.

  It attaches a "type" attribute to every class it creates and gives the classes an
  ordering, which is used to determine upcasting.
  '''

  def __init__(cls, name, bases, dict):
    super(DataTypeMetaclass, cls).__init__(name, bases, dict)
    # Classes with these names are their own "type". Several of them (Int, Decimal,
    # Float) implement get_generic_type() by referring to themselves by name, and that
    # name is not bound yet while the class is still being created, so the method must
    # not be called for them here.
    self_typed_names = (
        'Char', 'DataType', 'Decimal', 'Float', 'Int', 'Number', 'Timestamp')
    cls.type = cls if name in self_typed_names else cls.get_generic_type()

  def __cmp__(cls, other):
    '''Order classes by CMP_VALUE, using the class name when CMP_VALUE is not defined.

    Anything that was not created by this metaclass is reported as greater.
    '''
    if isinstance(other, DataTypeMetaclass):
      own_key = getattr(cls, 'CMP_VALUE', cls.__name__)
      other_key = getattr(other, 'CMP_VALUE', other.__name__)
      return cmp(own_key, other_key)
    return -1
class DataType(ValExpr):
  '''Root of the data type hierarchy.

  Every data type is a class of its own so that the relationships between types can be
  expressed through inheritance.
  '''

  __metaclass__ = DataTypeMetaclass

  @staticmethod
  def group_by_type(vals):
    '''Return a dict that maps each "type" to a ValExprList holding the items of
       "vals" that have that type.
    '''
    grouped = defaultdict(ValExprList)
    for val in vals:
      same_type_vals = grouped[val.type]
      same_type_vals.append(val)
    return grouped

  @classmethod
  def get_base_type(cls):
    '''Return the ancestor of this class (possibly the class itself) that inherits
       directly from DataType. For example Int and Decimal both give Number.

       This is only meaningful on subclasses of DataType; an Exception is raised if no
       such ancestor can be found.
    '''
    parents = cls.__bases__
    if DataType in parents:
      return cls
    for parent in parents:
      if issubclass(parent, DataType):
        # Only the first parent that is a data type is followed.
        return parent.get_base_type()
    raise Exception('Unable to determine base type of %s' % cls)

  @classmethod
  def get_generic_type(cls):
    '''Return the generic type of this class. Unless a subclass overrides this, it is
       the same as the base type.
    '''
    return cls.get_base_type()

  @classmethod
  def name(cls):
    '''Return the name of the class.'''
    return cls.__name__

  def __init__(self, val):
    self.val = val

  @property
  def exact_type(self):
    '''The class this object is an instance of.'''
    return type(self)
class Boolean(DataType):
  '''Boolean data type. Adds nothing to DataType.'''
class Number(DataType):
  '''Common parent of the numeric data types (Int, Decimal and Float).'''
class Int(Number):
  '''Integer type with the range of a signed 32-bit value. It is also the generic type
     of every integer class that derives from it.
  '''

  # Used to compare with other numbers for determining upcasting
  CMP_VALUE = 2

  # Used during data generation to keep vals in range
  MAX = 2 ** 31 - 1
  MIN = -(2 ** 31)

  @classmethod
  def get_generic_type(cls):
    '''Return Int, regardless of which integer class this is called on.'''
    return Int
class TinyInt(Int):
  '''Integer type with the range of a signed 8-bit value.'''

  CMP_VALUE = 0

  MAX = 2 ** 7 - 1
  MIN = -(2 ** 7)
class SmallInt(Int):
  '''Integer type with the range of a signed 16-bit value.'''

  CMP_VALUE = 1

  MAX = 2 ** 15 - 1
  MIN = -(2 ** 15)
class BigInt(Int):
  '''Integer type with the range of a signed 64-bit value.'''

  CMP_VALUE = 3

  MAX = 2 ** 63 - 1
  MIN = -(2 ** 63)
class Decimal(Number):
  '''Decimal type. It is also the generic type of every class that derives from it.'''

  CMP_VALUE = 4

  # Arbitrary default values
  MAX_DIGITS = 38
  MAX_FRACTIONAL_DIGITS = 10

  @classmethod
  def get_generic_type(cls):
    '''Return Decimal, regardless of which decimal class this is called on.'''
    return Decimal
class Float(Number):
  '''Floating point type. It is also the generic type of the classes that derive from
     it.
  '''

  CMP_VALUE = 5

  @classmethod
  def get_generic_type(cls):
    '''Return Float, regardless of which floating point class this is called on.'''
    return Float
class Double(Float):
  '''Floating point type that sorts after Float when determining upcasting.'''

  CMP_VALUE = 6
class Char(DataType):
  '''Character type. Since it inherits directly from DataType it is the base type of
     the string-like classes that derive from it.
  '''

  CMP_VALUE = 100

  MAX = 255   # This is not the true max
  MIN = 0
class VarChar(Char):
  '''Variable length character type.'''

  CMP_VALUE = 101

  # Not a true max. This is used to differentiate between VarChar and String.
  MAX = 255
class String(VarChar):
  '''String type. Its length range starts right after the largest VarChar length.'''

  CMP_VALUE = 102

  # This is not the true max.
  MAX = 1000
  # This is used to differentiate between VarChar and String.
  MIN = VarChar.MAX + 1
class Timestamp(DataType):
  '''Timestamp data type. Adds nothing to DataType.'''
# The specific type classes of this module. The DataType and Number parent classes are
# not listed.
EXACT_TYPES = [
  BigInt, Boolean, Char, Decimal,
  Double, Float, Int, SmallInt,
  String, Timestamp, TinyInt, VarChar]

# NOTE(review): presumably the types that may be used in join conditions -- confirm
# against the code that uses this constant.
JOINABLE_TYPES = (Char, Decimal, Int, Timestamp)

# The distinct values of the "type" attribute across EXACT_TYPES.
TYPES = tuple(set(exact_type.type for exact_type in EXACT_TYPES))
# Decimal subclasses that were already created, keyed by
# (total_digits, fractional_digits).
__DECIMAL_TYPE_CACHE = {}


def get_decimal_class(total_digits, fractional_digits):
  '''Return the Decimal subclass for the given number of digits.

  The class is created on first use and the same class object is returned for every
  later call with the same arguments.
  '''
  key = (total_digits, fractional_digits)
  try:
    return __DECIMAL_TYPE_CACHE[key]
  except KeyError:
    pass
  decimal_class = type(
      'Decimal%02d%02d' % (total_digits, fractional_digits),
      (Decimal, ),
      {'MAX_DIGITS': total_digits, 'MAX_FRACTIONAL_DIGITS': fractional_digits})
  __DECIMAL_TYPE_CACHE[key] = decimal_class
  return decimal_class
# Char subclasses that were already created, keyed by length.
__CHAR_TYPE_CACHE = {}


def get_char_class(length):
  '''Return the Char subclass whose MAX is the given length.

  The class is created on first use and the same class object is returned for every
  later call with the same length.
  '''
  try:
    return __CHAR_TYPE_CACHE[length]
  except KeyError:
    pass
  char_class = type('Char%04d' % length, (Char, ), {'MAX': length})
  __CHAR_TYPE_CACHE[length] = char_class
  return char_class
# VarChar subclasses that were already created, keyed by length.
__VARCHAR_TYPE_CACHE = {}


def get_varchar_class(length):
  '''Return the VarChar subclass whose MAX is the given length.

  The class is created on first use and the same class object is returned for every
  later call with the same length.
  '''
  try:
    return __VARCHAR_TYPE_CACHE[length]
  except KeyError:
    pass
  varchar_class = type('VarChar%04d' % length, (VarChar, ), {'MAX': length})
  __VARCHAR_TYPE_CACHE[length] = varchar_class
  return varchar_class