mirror of
https://github.com/apache/impala.git
synced 2025-12-30 12:02:10 -05:00
As part of the ASF transition, we need to replace references to Cloudera in Impala with references to Apache. This primarily means changing Java package names from com.cloudera.impala.* to org.apache.impala.*.

A prior patch renamed all the files as necessary, and this patch performs the actual code changes. Most of the changes in this patch were generated with commands of the form:

  find . | grep "\.java\|\.py\|\.h\|\.cc" | \
    xargs sed -i s/'com\(.\)cloudera\(\.\)impala/org\1apache\2impala/g

along with some manual fixes.

After this patch, the remaining references to Cloudera in the repo mostly fall into the following categories:
- External components that have cloudera in their own package names, e.g. com.cloudera.kudu/llama
- URLs, e.g. https://repository.cloudera.com/

Change-Id: I0d35fa6602a7fc0c212b2ef5e2b3322b77dde7e2
Reviewed-on: http://gerrit.cloudera.org:8080/3937
Reviewed-by: Thomas Tauber-Marshall <tmarshall@cloudera.com>
Reviewed-by: Jim Apple <jbapple@cloudera.com>
Tested-by: Internal Jenkins
188 lines
5.9 KiB
Python
Executable File
188 lines
5.9 KiB
Python
Executable File
#!/usr/bin/env impala-python
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from random import choice, randint, random, shuffle
|
|
from os.path import join as join_path
|
|
from optparse import OptionParser
|
|
|
|
import json
|
|
|
|
# Inclusive upper bound on the number of fields drawn for a generated struct.
MAX_NUM_STRUCT_FIELDS = 8

# Probability that a converted type is wrapped in a nullable ['null', T] union.
NULL_CHANCE = 0.5

# Type names used to fill every field slot that is not taken by a nested type.
SCALAR_TYPES = [
    'boolean',
    'int',
    'long',
    'float',
    'double',
    'string',
]
|
|
|
|
class Node(object):
    """One nested-type node (struct, map or array) of a schema tree.

    ``fields`` starts empty and is filled in by the tree generator; it ends up
    holding child ``Node`` objects and/or scalar type names.
    """

    def __init__(self, num_fields, node_type):
        # Children attached so far; a fresh list per instance.
        self.fields = []
        # Total number of field slots this node is meant to hold.
        self.num_fields = num_fields
        # One of 'struct', 'map' or 'array'.
        self.node_type = node_type
|
|
|
|
|
|
class SchemaTreeGenerator(object):
    """Builds a random tree of struct/map/array nodes with scalar leaves.

    The root is always a struct. A single chain ("trunk") of ``target_depth``
    nested nodes is attached first. Random nodes are then attached to free
    field slots until the number of scalar slots reaches
    ``target_num_scalars``. Finally every slot that is still free is filled
    with a scalar type name from SCALAR_TYPES.
    """

    def __init__(self, target_num_scalars=10, target_depth=3):
        self._target_num_scalars = target_num_scalars
        self._target_depth = target_depth
        # Every node of the tree currently being built, root included.
        self._nodes = []
        # Number of field slots that will end up as scalars if nothing else
        # is attached.
        self._num_scalars_created = 0
        self.root = None

    def _create_random_node(self):
        """Create a random map, array or struct node and register it.

        The new node is appended to ``self._nodes`` here, so callers must not
        register it again.
        """
        node_type = choice(('map', 'array', 'struct'))
        if node_type in ('map', 'array'):
            # Takes one slot in its parent and offers one slot: net zero.
            result_node = Node(1, node_type)
        else:
            num_fields = randint(1, MAX_NUM_STRUCT_FIELDS)
            # Takes one slot in its parent and offers num_fields new ones.
            self._num_scalars_created += num_fields - 1
            result_node = Node(num_fields, 'struct')
        self._nodes.append(result_node)
        return result_node

    def _get_random_existing_node(self):
        """Pick a node at random, weighted by its number of free slots."""
        candidates = [
            node
            for node in self._nodes
            for _ in range(node.num_fields - len(node.fields))
        ]
        return choice(candidates)

    def _generate_rest(self):
        """Attach random nodes until the scalar target is met, then finalize."""
        while self._num_scalars_created < self._target_num_scalars:
            node = self._get_random_existing_node()
            node.fields.append(self._create_random_node())
        self._finalize()

    def _generate_trunk(self):
        """Hang a chain of ``target_depth`` nested nodes off the root."""
        cur = self.root
        for _ in range(self._target_depth):
            # _create_random_node() already registers the node in
            # self._nodes. Appending it again here would list trunk nodes
            # twice and double their weight in _get_random_existing_node().
            new_node = self._create_random_node()
            cur.fields.append(new_node)
            cur = new_node

    def _finalize(self):
        """Fill each node's remaining slots with scalars and shuffle its fields."""
        for node in self._nodes:
            for _ in range(node.num_fields - len(node.fields)):
                node.fields.append(choice(SCALAR_TYPES))
            shuffle(node.fields)

    def create_tree(self):
        """Generate a fresh random tree and return its root struct node."""
        self.root = Node(randint(1, MAX_NUM_STRUCT_FIELDS), 'struct')
        self._nodes = [self.root]
        self._num_scalars_created = self.root.num_fields
        self._generate_trunk()
        self._generate_rest()
        return self.root
|
|
|
|
|
|
class AvroGenerator(object):
    """Turns random schema trees into Avro record schemas (JSON-ready dicts)."""

    def __init__(self, schema_tree_generator):
        # Counter behind the generated 'field_N' / 'struct_N' names.
        self.cur_id = 0
        # Object whose create_tree() returns the root node of a schema tree.
        self._schema_tree_generator = schema_tree_generator

    def _next_id(self):
        """Advance the name counter and return it as a string."""
        self.cur_id += 1
        return str(self.cur_id)

    def clear_state(self):
        """Reset the name counter so numbering restarts from 1."""
        self.cur_id = 0

    def create(self, table_name):
        """Generate a new random tree and return it as an Avro record schema.

        ``table_name`` becomes the name of the top-level record.
        """
        tree_root = self._schema_tree_generator.create_tree()
        result = {}
        result['type'] = 'record'
        result['namespace'] = 'org.apache.impala'
        result['name'] = table_name
        result['fields'] = self._convert_struct_fields(tree_root.fields)
        return result

    def _convert_struct_fields(self, fields):
        """Convert each entry of a struct's field list to an Avro field dict."""
        return [self._convert_struct_field(field) for field in fields]

    def _convert_struct_field(self, struct_field_node):
        """Return an Avro field dict with a converted type and a fresh name."""
        result = {}
        # The type is converted before the name is drawn, so nested names
        # receive the lower ids.
        result['type'] = self._convert_node(struct_field_node)
        result['name'] = 'field_' + self._next_id()
        return result

    def _convert_node(self, node):
        """Convert a scalar type name or a Node into an Avro type.

        With probability NULL_CHANCE the type is wrapped in a
        ``['null', type]`` union to make it nullable.

        Raises:
            AssertionError: if ``node`` is a Node with an unknown node_type.
        """
        if isinstance(node, str):
            # Scalar leaves are stored as plain type-name strings.
            result = node
        elif node.node_type == 'array':
            result = self._convert_array(node)
        elif node.node_type == 'map':
            result = self._convert_map(node)
        elif node.node_type == 'struct':
            result = self._convert_struct(node)
        else:
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError('Unknown type: ' + str(node.node_type))
        if random() < NULL_CHANCE:
            # Make it nullable
            return ['null', result]
        return result

    def _convert_array(self, array_node):
        """Return an Avro array type whose items are the node's single child."""
        result = {}
        result['type'] = 'array'
        result['items'] = self._convert_node(array_node.fields[0])
        return result

    def _convert_map(self, map_node):
        """Return an Avro map type whose values are the node's single child."""
        result = {}
        result['type'] = 'map'
        result['values'] = self._convert_node(map_node.fields[0])
        return result

    def _convert_struct(self, struct_node):
        """Return an Avro record type with a fresh name and converted fields."""
        result = {}
        result['type'] = 'record'
        # The record name is drawn before its fields are converted.
        result['name'] = 'struct_' + self._next_id()
        result['fields'] = self._convert_struct_fields(struct_node.fields)
        return result
|
|
|
|
|
|
# Script entry point: write --num_tables random Avro schemas as
# <base_table_name><N>.avsc files into --target_dir.
if __name__ == '__main__':
    parser = OptionParser()
    parser.add_option('--target_dir', default='/tmp',
                      help='Directory where the avro schemas will be saved.')
    parser.add_option('--num_tables', default=4, type='int',
                      help='Number of schemas to generate.')
    parser.add_option('--num_scalars', default=10, type='int',
                      help='Target number of scalar fields in each schema.')
    parser.add_option('--nesting_depth', default=3, type='int',
                      help='Target nesting depth of each schema.')
    parser.add_option('--base_table_name', default='table_',
                      help='Base table name.')
    options, args = parser.parse_args()

    schema_generator = SchemaTreeGenerator(
        target_num_scalars=options.num_scalars,
        target_depth=options.nesting_depth)
    writer = AvroGenerator(schema_generator)

    for table_num in range(options.num_tables):
        # Restart field/struct numbering for every table.
        writer.clear_state()
        table_name = options.base_table_name + str(table_num)
        json_result = writer.create(table_name)
        file_path = join_path(options.target_dir, table_name + '.avsc')

        with open(file_path, 'w') as f:
            json.dump(json_result, f, indent=2, sort_keys=True)
|