mirror of
https://github.com/apache/impala.git
synced 2026-01-07 09:02:19 -05:00
The major changes are: 1) Collect backtrace and fatal log on crash. 2) Poll memory usage. The data is only displayed at this time. 3) Support kerberos. 4) Add random queries. 5) Generate random and TPC-H nested data on a remote cluster. The random data generator was converted to use MR for scaling. 6) Add a cluster abstraction to run data loading for #5 on a remote or local cluster. This also moves and consolidates some Cloudera Manager utilities that were in the stress test. 7) Cleanup the wrappers around impyla. That stuff was getting messy. Change-Id: I4e4b72dbee1c867626a0b22291dd6462819e35d7 Reviewed-on: http://gerrit.cloudera.org:8080/1298 Reviewed-by: Casey Ching <casey@cloudera.com> Tested-by: Internal Jenkins
59 lines
2.1 KiB
Python
Executable File
59 lines
2.1 KiB
Python
Executable File
#!/bin/bash
|
|
# Copyright (c) 2015 Cloudera, Inc. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# The line below is interpreted as an invalid command in bash and a string literal in
|
|
# python.
|
|
'''' &>/dev/null
set -e
# pypy is preferred since it's about 10x faster than cpython.
# "$0" is quoted so that a script path containing whitespace is passed to the
# interpreter as a single word, and "$@" forwards any command-line arguments.
if which pypy &>/dev/null; then
  exec pypy "$0" "$@"
else
  exec python "$0" "$@"
fi
'''
|
|
|
|
'''This is a reducer for use with hadoop streaming. See data_generator.DatabasePopulator
|
|
for more information on how this file is used.
|
|
'''
|
|
|
|
import os
|
|
import random
|
|
import subprocess
|
|
import sys
|
|
|
|
# When running locally, the PYTHONPATH needed by impala-shell interferes with python
|
|
# through YARN. Specifically, when the data generator needs to import common.py, python
|
|
# looks at $IMPALA_HOME/tests/common and errors when it doesn't find what was asked for.
|
|
sys.path.insert(1, os.getcwd())
|
|
|
|
from data_generator_mapred_common import deserialize
|
|
|
|
# Reducer main loop. Each line on stdin holds three tab-separated fields: a leading
# field that is ignored here, the batch index, and a serialized table data generator.
# For every line, the batch is generated into a local file, uploaded with
# "hadoop fs -put" to the table's storage location, and the local file is removed.
for line in sys.stdin:
    _, batch_idx, serialized_table_data_generator = line.split("\t")
    table_data_generator = deserialize(serialized_table_data_generator)
    # Seed the RNG with the seed carried by the generator before producing this batch.
    random.seed(table_data_generator.randomization_seed)
    output_file_name = "batch_%s.data" % batch_idx
    try:
        # Close the file before uploading so all generated data is flushed to disk.
        with open(output_file_name, "w") as output_file:
            table_data_generator.output_file = output_file
            table_data_generator.populate_output_file()
        put = subprocess.Popen(
            ["hadoop", "fs", "-put", output_file_name,
             table_data_generator.table.storage_location],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # communicate() reads the child's output while waiting for it to exit. Calling
        # wait() on a process with a piped stdout can deadlock once the child has
        # written enough to fill the OS pipe buffer.
        put_output = put.communicate()[0]
        if put.returncode != 0:
            raise Exception("Error uploading data to hdfs: %s" % put_output)
    finally:
        # Remove the local copy whether or not generation/upload succeeded so a failed
        # batch does not leave its data file behind. The existence check keeps a
        # failure in open() from being masked by an error from os.remove().
        if os.path.exists(output_file_name):
            os.remove(output_file_name)
|