mirror of
https://github.com/apache/impala.git
synced 2026-01-05 03:01:02 -05:00
Change-Id: Ie89f140ed389cd877a84ffe2df892853ac9897f2 Reviewed-on: http://gerrit.cloudera.org:8080/1167 Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com> Tested-by: Internal Jenkins
184 lines
5.7 KiB
Python
Executable File
184 lines
5.7 KiB
Python
Executable File
#!/usr/bin/env impala-python
|
|
# Copyright (c) 2015 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from random import choice, randint, random, shuffle
|
|
from os.path import join as join_path
|
|
from optparse import OptionParser
|
|
|
|
import json
|
|
|
|
# Upper bound (inclusive) for the random number of fields given to a struct node.
MAX_NUM_STRUCT_FIELDS = 8
|
|
# Probability that a converted node is wrapped in a ['null', ...] union.
NULL_CHANCE = 0.5
|
|
# Type names randomly chosen to fill the slots left open in the schema tree.
SCALAR_TYPES = ['boolean', 'int', 'long', 'float', 'double', 'string']
|
|
|
|
class Node(object):
    """A non-scalar vertex of a generated schema tree.

    Attributes are plain data:
      node_type  -- one of 'struct', 'map' or 'array'.
      num_fields -- how many entries ``fields`` is meant to end up holding.
      fields     -- the child entries; always starts out as a fresh empty list.
    """

    def __init__(self, num_fields, node_type):
        # Each instance gets its own list so children are never shared.
        self.fields = []
        self.num_fields = num_fields
        self.node_type = node_type
|
|
|
|
|
|
class SchemaTreeGenerator(object):
    """Builds a random tree of struct/map/array nodes with scalar leaves.

    A tree is grown in three steps (see ``create_tree``):
      1. a struct root with a random number of slots is created;
      2. a chain ("trunk") of ``target_depth`` nested nodes is hung below it;
      3. random nodes are attached to random open slots until the scalar
         counter reaches ``target_num_scalars``, after which every slot that
         is still open is filled with a random name from SCALAR_TYPES.
    """

    def __init__(self, target_num_scalars=10, target_depth=3):
        """Store the generation targets; no tree exists until create_tree()."""
        self._target_num_scalars = target_num_scalars
        self._target_depth = target_depth
        # Every Node of the current tree, each registered exactly once.
        self._nodes = []
        # Running count of slots that will end up as scalars if nothing else
        # is attached to them.
        self._num_scalars_created = 0
        self.root = None

    def _create_random_node(self):
        """Create a random map, array or struct node and register it.

        Maps and arrays always get a single slot. A struct gets between 1 and
        MAX_NUM_STRUCT_FIELDS slots. The new node is appended to ``_nodes``
        here, so callers must not register it again.

        Returns:
          The newly created Node (not yet attached to any parent).
        """
        node_type = choice(('map', 'array', 'struct'))
        if node_type == 'struct':
            num_fields = randint(1, MAX_NUM_STRUCT_FIELDS)
            # The struct takes up one slot that was already counted and
            # contributes num_fields new ones, hence the net change of
            # num_fields - 1. Maps/arrays swap one slot for one slot.
            self._num_scalars_created += num_fields - 1
        else:
            num_fields = 1
        node = Node(num_fields, node_type)
        self._nodes.append(node)
        return node

    def _get_random_existing_node(self):
        """Pick a registered node that still has an open slot.

        Each node is weighted by its number of open slots, so the choice is
        uniform over open slots rather than over nodes.

        Raises:
          IndexError: if no registered node has an open slot.
        """
        candidates = []
        for node in self._nodes:
            open_slots = node.num_fields - len(node.fields)
            candidates.extend([node] * open_slots)
        return choice(candidates)

    def _generate_rest(self):
        """Attach random nodes until the scalar target is met, then finalize."""
        while self._num_scalars_created < self._target_num_scalars:
            parent = self._get_random_existing_node()
            parent.fields.append(self._create_random_node())
        self._finalize()

    def _generate_trunk(self):
        """Hang a chain of ``target_depth`` nested random nodes below the root."""
        cur = self.root
        for _ in range(self._target_depth):
            # _create_random_node() already registers the node in _nodes;
            # appending it again here would list it twice, doubling its weight
            # in _get_random_existing_node().
            new_node = self._create_random_node()
            cur.fields.append(new_node)
            cur = new_node

    def _finalize(self):
        """Fill every open slot with a random scalar type and shuffle fields."""
        for node in self._nodes:
            for _ in range(node.num_fields - len(node.fields)):
                node.fields.append(choice(SCALAR_TYPES))
            # Shuffle so nested nodes are not always the leading fields.
            shuffle(node.fields)

    def create_tree(self):
        """Generate a fresh random tree and return its root struct Node.

        All per-tree state is reset, so the generator can be reused.
        """
        self.root = Node(randint(1, MAX_NUM_STRUCT_FIELDS), 'struct')
        self._nodes = [self.root]
        # Until something is attached, every root slot counts as a scalar.
        self._num_scalars_created = self.root.num_fields
        self._generate_trunk()
        self._generate_rest()
        return self.root
|
|
|
|
|
|
class AvroGenerator(object):
    """Turns trees from a schema tree generator into Avro schema dictionaries.

    The generator passed in must provide ``create_tree()`` returning a root
    object with a ``fields`` list. Entries of ``fields`` are either scalar
    type names (strings) or node objects exposing ``node_type`` and ``fields``.
    """

    def __init__(self, schema_tree_generator):
        # Counter used to derive unique 'field_N' / 'struct_N' names.
        self.cur_id = 0
        self._schema_tree_generator = schema_tree_generator

    def _next_id(self):
        """Advance the name counter and return its new value as a string."""
        self.cur_id += 1
        return str(self.cur_id)

    def clear_state(self):
        """Reset the name counter so the next schema starts numbering at 1."""
        self.cur_id = 0

    def create(self, table_name):
        """Generate a new random tree and return it as a top-level Avro record.

        Args:
          table_name: used as the record's 'name'.

        Returns:
          A dict with 'type', 'namespace', 'name' and 'fields' entries.
        """
        tree_root = self._schema_tree_generator.create_tree()
        return {
            'type': 'record',
            'namespace': 'com.cloudera.impala',
            'name': table_name,
            'fields': self._convert_struct_fields(tree_root.fields),
        }

    def _convert_struct_fields(self, fields):
        """Convert each entry of a struct's ``fields`` into a field dict."""
        return [self._convert_struct_field(field) for field in fields]

    def _convert_struct_field(self, struct_field_node):
        """Return a {'type': ..., 'name': 'field_N'} dict for one struct field."""
        # Convert the type first: nested structs consume ids of their own, and
        # the field's own id is taken only afterwards.
        field_type = self._convert_node(struct_field_node)
        field_name = 'field_' + self._next_id()
        return {'type': field_type, 'name': field_name}

    def _convert_node(self, node):
        """Convert a scalar type name or a node into its Avro representation.

        With probability NULL_CHANCE the result is wrapped as
        ``['null', result]`` to make it nullable.

        Raises:
          AssertionError: if ``node`` is not a string and its ``node_type`` is
            not one of 'array', 'map' or 'struct'.
        """
        if isinstance(node, str):
            result = node
        elif node.node_type == 'array':
            result = self._convert_array(node)
        elif node.node_type == 'map':
            result = self._convert_map(node)
        elif node.node_type == 'struct':
            result = self._convert_struct(node)
        else:
            # Raised explicitly (rather than via `assert False`) so the check
            # survives `python -O` and reports the offending type.
            raise AssertionError('Unknown type: ' + str(node.node_type))
        if random() < NULL_CHANCE:
            # Make it nullable
            return ['null', result]
        return result

    def _convert_array(self, array_node):
        """Return an Avro array schema whose items come from the single child."""
        return {
            'type': 'array',
            'items': self._convert_node(array_node.fields[0]),
        }

    def _convert_map(self, map_node):
        """Return an Avro map schema whose values come from the single child."""
        return {
            'type': 'map',
            'values': self._convert_node(map_node.fields[0]),
        }

    def _convert_struct(self, struct_node):
        """Return an Avro record schema named 'struct_N' for a struct node."""
        # Take the record's id before descending so outer records get lower
        # numbers than the fields and records nested inside them.
        record_name = 'struct_' + self._next_id()
        return {
            'type': 'record',
            'name': record_name,
            'fields': self._convert_struct_fields(struct_node.fields),
        }
|
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: writes --num_tables random Avro schemas as
    # <base_table_name><index>.avsc files into --target_dir.
    parser = OptionParser()
    parser.add_option('--target_dir', default='/tmp',
                      help='Directory where the avro schemas will be saved.')
    parser.add_option('--num_tables', default=4, type='int',
                      help='Number of schemas to generate.')
    parser.add_option('--num_scalars', default=10, type='int',
                      help='Target number of scalar fields in each schema.')
    parser.add_option('--nesting_depth', default=3, type='int',
                      help='Target nesting depth of each schema.')
    parser.add_option('--base_table_name', default='table_',
                      help='Base table name.')
    options, args = parser.parse_args()

    schema_generator = SchemaTreeGenerator(
        target_num_scalars=options.num_scalars,
        target_depth=options.nesting_depth)
    writer = AvroGenerator(schema_generator)

    for table_num in range(options.num_tables):
        # Restart field/struct numbering for every schema.
        writer.clear_state()
        table_name = options.base_table_name + str(table_num)
        json_result = writer.create(table_name)
        file_path = join_path(options.target_dir, table_name + '.avsc')

        with open(file_path, 'w') as f:
            json.dump(json_result, f, indent=2, sort_keys=True)
|