mirror of
https://github.com/apache/impala.git
synced 2026-01-08 12:02:54 -05:00
This patch now requires users of the Leopard framework to supply a DOCKER_IMAGE_NAME. The cloudera/impala image isn't being maintained, and until Apache Impala (incubating) decides to publish its own image, no default image name is possible. It is still possible to run the Leopard framework against a homegrown Docker image that has Apache Impala (incubating) and PostgreSQL installed: simply build such an image and export the DOCKER_IMAGE_NAME environment variable before running the controller. While here, fix some flake8 non-indent problems. Testing: short Leopard controller / query generator run. Change-Id: Ic1cb96cb5c9a894f40e0892f3bdd3f3d0158e887 Reviewed-on: http://gerrit.cloudera.org:8080/4936 Reviewed-by: Matthew Jacobs <mj@cloudera.com> Reviewed-by: Jim Apple <jbapple@cloudera.com> Reviewed-by: Taras Bobrovytsky <tbobrovytsky@cloudera.com> Tested-by: Jim Apple <jbapple@cloudera.com>
335 lines
13 KiB
Python
Executable File
335 lines
13 KiB
Python
Executable File
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
'''This module generates a docker environment for a job'''
|
|
|
|
from __future__ import division
|
|
from fabric.api import sudo, run, settings
|
|
from logging import getLogger
|
|
from os.path import (
|
|
join as join_path,
|
|
normpath)
|
|
from time import sleep
|
|
from tests.comparison.leopard.controller import (
|
|
SHOULD_BUILD_IMPALA,
|
|
SHOULD_LOAD_DATA,
|
|
SHOULD_PULL_DOCKER_IMAGE)
|
|
import random
|
|
import os
|
|
|
|
# Paths used by the shell commands this module runs over SSH in the container.
IMPALA_HOME = '/home/dev/Impala'
CORE_PATH = '/tmp/core_files'

# Branch checked out by build_impala() when no explicit git command is given.
DEFAULT_BRANCH_NAME = os.environ.get('DEFAULT_BRANCH_NAME', 'origin/master')
# Login name used for SSH connections into the container.
DOCKER_USER_NAME = os.environ.get('DOCKER_USER_NAME', 'dev')

# Needed for ensuring the testdata volume is properly owned. The UID/GID from the
# container must be used, not the symbolic name.
DOCKER_IMPALA_USER_UID = int(os.environ.get('DOCKER_IMPALA_USER_UID', 1234))
DOCKER_IMPALA_USER_GID = int(os.environ.get('DOCKER_IMPALA_USER_GID', 1000))

# Host-side directory that receives the container's testdata and is later mounted
# back into the container as an external volume.
HOST_TESTDATA_EXTERNAL_VOLUME_PATH = normpath(os.environ.get(
    'HOST_TESTDATA_EXTERNAL_VOLUME_PATH',
    os.path.sep + join_path('data', '1', 'dockervols', 'cluster')))

DEFAULT_DOCKER_TESTDATA_VOLUME_PATH = os.path.sep + join_path(
    'home', DOCKER_USER_NAME, 'Impala', 'testdata', 'cluster')

# This needs to have a trailing os.path.sep for rsync so that the contents of the rsync
# source will be put directly into this directory. See "man rsync" for the
# idiosyncrasies of a trailing / (or not) in paths.
DOCKER_TESTDATA_VOLUME_PATH = normpath(
    os.environ.get(
        'DOCKER_TESTDATA_VOLUME_PATH',
        DEFAULT_DOCKER_TESTDATA_VOLUME_PATH)
) + os.path.sep

# Private key passed to "ssh -i" for the host-to-container rsync. The default is
# built with expanduser('~') rather than os.environ['HOME'] so that importing this
# module does not raise KeyError when HOME is unset (the default expression is always
# evaluated, even when HOST_TO_DOCKER_SSH_KEY is supplied).
HOST_TO_DOCKER_SSH_KEY = os.environ.get(
    'HOST_TO_DOCKER_SSH_KEY',
    join_path(os.path.expanduser('~'), '.ssh', 'ro-rsync_rsa'))

# Upper bounds for container start attempts and for retried Fabric calls.
NUM_START_ATTEMPTS = 50
NUM_FABRIC_ATTEMPTS = 50

LOG = getLogger('ImpalaDockerEnv')
|
|
|
|
|
|
def retry(func):
    '''Retry decorator.

    Returns a wrapper that calls func until it returns without raising, and passes
    that return value through. After each failed attempt the exception is logged and
    the wrapper sleeps a random 1..attempt_num seconds; the exception from attempt
    number NUM_FABRIC_ATTEMPTS is re-raised to the caller.
    '''
    # Function-scope import keeps this block independent of the file's import list.
    import functools

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        attempt_num = 0
        while True:
            attempt_num += 1
            try:
                return func(*args, **kwargs)
            # SystemExit is retried on purpose: Fabric's abort() raises it. Unlike a
            # bare "except:", KeyboardInterrupt is left alone so Ctrl-C stops the loop.
            except (Exception, SystemExit):
                # Log the first positional argument (the command string for Fabric's
                # run/sudo); guard against keyword-only calls so that an IndexError
                # here cannot mask the real failure.
                LOG.exception('{0} exception [{1}] (try: {2})'.format(
                    func.__name__, args[0] if args else None, attempt_num))
                if attempt_num == NUM_FABRIC_ATTEMPTS:
                    raise
                sleep_time = random.randint(1, attempt_num)
                sleep(sleep_time)

    return wrapper
|
|
|
|
|
|
class ImpalaDockerEnv(object):
    '''Represents an Impala environment inside a Docker container.

    Used for starting Impala, getting stack traces after a crash and keeping track of
    the ports on which SSH, Postgres and Impala are running.
    '''

    def __init__(self, git_command):
        '''Store git_command and read the target host settings from the environment.

        git_command: shell snippet that build_impala() runs in IMPALA_HOME before
          building. When it is falsy, build_impala() only builds if
          SHOULD_BUILD_IMPALA is set, using DEFAULT_BRANCH_NAME.

        Raises KeyError if TARGET_HOST, TARGET_HOST_USERNAME or DOCKER_IMAGE_NAME is
        missing from the environment.
        '''
        # Ports and container ID are filled in by start_new_container().
        self.ssh_port = None
        self.impala_port = None
        self.postgres_port = None
        self.container_id = None
        self.git_command = git_command
        self.host = os.environ['TARGET_HOST']
        self.host_username = os.environ['TARGET_HOST_USERNAME']
        self.docker_image_name = os.environ['DOCKER_IMAGE_NAME']

    def _host_settings(self):
        '''Return the Fabric settings context for commands run on the Docker host.'''
        return settings(warn_only=True, host_string=self.host, user=self.host_username)

    def _docker_settings(self):
        '''Return the Fabric settings context for commands run inside the container.

        Connects as DOCKER_USER_NAME to the forwarded SSH port on the target host.
        Raises KeyError if DOCKER_PASSWORD is missing from the environment.
        '''
        return settings(
            warn_only=True,
            host_string='{0}@{1}:{2}'.format(DOCKER_USER_NAME, self.host, self.ssh_port),
            password=os.environ['DOCKER_PASSWORD'])

    def stop_docker(self):
        '''Stop and then remove the container identified by self.container_id.'''
        with self._host_settings():
            retry(sudo)('docker stop {0}'.format(self.container_id), pty=True)
            retry(sudo)('docker rm {0}'.format(self.container_id), pty=True)

    def start_new_container(self, volume_map=None):
        '''Starts a container with port forwarding for ssh, impala and postgres.

        The optional volume_map is a dictionary for making use of Docker external
        volumes. The keys are paths on the host, and the values are paths on the
        container.

        Up to NUM_START_ATTEMPTS attempts are made, each with freshly chosen random
        ports. On success self.container_id holds the output of the start command;
        if every attempt fails an error is logged and self.container_id is None.
        '''
        # Forget any previous container so that a stale ID cannot be mistaken for a
        # successful start (prepare() may call this method twice).
        self.container_id = None
        for _ in range(NUM_START_ATTEMPTS):
            with self._host_settings():
                set_core_dump_location_command = \
                    "echo '/tmp/core_files/core.%e.%p' | sudo tee /proc/sys/kernel/core_pattern"
                sudo(set_core_dump_location_command, pty=True)
                # One random offset is shared by the three forwarded ports.
                port = random.randint(0, 999)
                self.ssh_port = 55000 + port
                self.impala_port = 56000 + port
                self.postgres_port = 57000 + port

                start_command = ''
                if SHOULD_PULL_DOCKER_IMAGE:
                    start_command = 'docker pull {docker_image_name} && '.format(
                        docker_image_name=self.docker_image_name)
                volume_ops = ''
                if volume_map is not None:
                    # items() instead of the Python 2-only iteritems().
                    volume_ops = ' '.join(
                        '-v {host_path}:{container_path}'.format(
                            host_path=host_path, container_path=container_path)
                        for host_path, container_path in volume_map.items())
                start_command += (
                    'docker run -d -t {volume_ops} -p {postgres_port}:5432 -p {ssh_port}:22 '
                    '-p {impala_port}:21050 {docker_image_name} /bin/docker-boot-daemon'
                ).format(
                    volume_ops=volume_ops,
                    ssh_port=self.ssh_port,
                    impala_port=self.impala_port,
                    postgres_port=self.postgres_port,
                    docker_image_name=self.docker_image_name)

                try:
                    start_result = sudo(start_command, pty=True)
                except Exception as e:
                    LOG.exception('start_new_container:' + str(e))
                else:
                    # Under warn_only a failing command does not raise; Fabric marks
                    # the returned value as failed instead. Treat that as a failed
                    # attempt so the loop retries with different ports.
                    if getattr(start_result, 'failed', False):
                        LOG.warning('start_new_container: start command failed: {0}'.format(
                            start_result))
                    else:
                        self.container_id = start_result
            if self.container_id is not None:
                break
        else:
            LOG.error('Container failed to start after {0} attempts'.format(
                NUM_START_ATTEMPTS))
        # Wait for the SSH service to start inside the docker instance. Usually takes 1
        # second. This is simple and reliable. An alternative implementation is to poll
        # with timeout if SSH was started.
        sleep(10)

    def get_git_hash(self):
        '''Returns the short Git hash of the current commit in IMPALA_HOME.'''
        with self._docker_settings():
            git_hash = retry(run)('cd {IMPALA_HOME} && git rev-parse --short HEAD'.format(
                IMPALA_HOME=IMPALA_HOME))
            return git_hash

    def run_all(self):
        '''Creates the core file directory and test configuration, then runs run-all.sh.

        Returns the result of the remote command so that callers can log it.
        '''
        with self._docker_settings():
            run_all_command = (
                'mkdir -p {CORE_PATH} && chmod 777 {CORE_PATH} && cd {IMPALA_HOME} '
                '&& source {IMPALA_HOME}/bin/impala-config.sh '
                '&& {IMPALA_HOME}/bin/create-test-configuration.sh '
                '&& {IMPALA_HOME}/testdata/bin/run-all.sh').format(
                    CORE_PATH=CORE_PATH,
                    IMPALA_HOME=IMPALA_HOME)
            return retry(run)(run_all_command, pty=False)

    def build_impala(self):
        '''Fetches and builds Impala.

        If git_command is set it is run before the build. Otherwise, when
        SHOULD_BUILD_IMPALA is set, DEFAULT_BRANCH_NAME is fetched and checked out.
        If neither applies, nothing is built.
        '''
        build_command = None
        if self.git_command:
            build_command = (
                'docker-boot && cd {IMPALA_HOME} && {git_command} '
                '&& source {IMPALA_HOME}/bin/impala-config.sh '
                '&& {IMPALA_HOME}/buildall.sh -notests').format(
                    git_command=self.git_command,
                    IMPALA_HOME=IMPALA_HOME)
        elif SHOULD_BUILD_IMPALA:
            build_command = (
                'docker-boot && cd {IMPALA_HOME} '
                '&& git fetch --all && git checkout {DEFAULT_BRANCH_NAME} '
                '&& source {IMPALA_HOME}/bin/impala-config.sh '
                '&& {IMPALA_HOME}/buildall.sh -notests').format(
                    IMPALA_HOME=IMPALA_HOME,
                    DEFAULT_BRANCH_NAME=DEFAULT_BRANCH_NAME)

        if build_command:
            with self._docker_settings():
                result = retry(run)(build_command, pty=False)
                LOG.info('Build Complete, Result: {0}'.format(result))

    def load_data(self):
        '''Starts Impala and runs the data generator when SHOULD_LOAD_DATA is set.

        Returns the result of the load command, or None when loading is disabled.
        '''
        if SHOULD_LOAD_DATA:
            with self._docker_settings():
                self.start_impala()
                load_command = '''cd {IMPALA_HOME} \
                    && source bin/impala-config.sh \
                    && ./tests/comparison/data_generator.py \
                    --use-postgresql --db-name=functional \
                    --migrate-table-names=alltypes,alltypestiny,alltypesagg migrate \
                    && ./tests/comparison/data_generator.py --use-postgresql'''.format(
                        IMPALA_HOME=IMPALA_HOME)
                result = retry(run)(load_command, pty=False)
                return result

    def start_impala(self):
        '''Starts the Impala cluster inside the container; returns the command result.'''
        with self._docker_settings():
            impalad_args = [
                '-convert_legacy_hive_parquet_utc_timestamps=true',
            ]
            start_command = (
                'source {IMPALA_HOME}/bin/impala-config.sh '
                '&& {IMPALA_HOME}/bin/start-impala-cluster.py '
                '--impalad_args="{impalad_args}"').format(
                    IMPALA_HOME=IMPALA_HOME,
                    impalad_args=' '.join(impalad_args))
            result = retry(run)(start_command, pty=False)
            return result

    def is_impala_running(self):
        '''Check that exactly 3 impalads are running inside the docker instance.'''
        with self._docker_settings():
            return retry(run)('ps aux | grep impalad').count('/service/impalad') == 3

    def get_stack(self):
        '''Finds the newest core file and extracts the stack trace from it using gdb.

        Returns None when the newest entry in CORE_PATH does not look like a core
        file. Otherwise returns the gdb output and deletes all core files.
        '''
        IMPALAD_PATH = '{IMPALA_HOME}/be/build/debug/service/impalad'.format(
            IMPALA_HOME=IMPALA_HOME)
        with self._docker_settings():
            core_file_name = retry(run)('ls {0} -t1 | head -1'.format(CORE_PATH))
            LOG.info('Core File Name: {0}'.format(core_file_name))
            if 'core' not in core_file_name:
                return None
            core_full_path = join_path(CORE_PATH, core_file_name)
            stack_trace = retry(run)('gdb {0} {1} --batch --quiet --eval-command=bt'.format(
                IMPALAD_PATH, core_full_path))
            self.delete_core_files()
            return stack_trace

    def delete_core_files(self):
        '''Delete all core files. This is usually done after the stack was extracted.'''
        with self._docker_settings():
            retry(run)('rm -f {0}/core.*'.format(CORE_PATH))

    def _warm_testdata_volume(self):
        '''Copies the testdata directory from a throwaway container onto the host.'''
        LOG.info('Warming testdata cluster external volume')
        self.start_new_container()
        with self._host_settings():
            sudo(
                'mkdir -p {host_testdata_path} && '
                'rsync -e "ssh -i {priv_key} -o StrictHostKeyChecking=no '
                '-o UserKnownHostsFile=/dev/null -p {ssh_port}" '
                '--delete --archive --verbose --progress --chown={uid}:{gid} '
                '{user}@127.0.0.1:{container_testdata_path} {host_testdata_path}'.format(
                    host_testdata_path=HOST_TESTDATA_EXTERNAL_VOLUME_PATH,
                    priv_key=HOST_TO_DOCKER_SSH_KEY,
                    ssh_port=self.ssh_port,
                    uid=DOCKER_IMPALA_USER_UID,
                    gid=DOCKER_IMPALA_USER_GID,
                    user=DOCKER_USER_NAME,
                    container_testdata_path=DOCKER_TESTDATA_VOLUME_PATH))
        self.stop_docker()

    def prepare(self):
        '''Create a new Impala Environment.

        Starts a docker container, builds Impala in it, runs run_all() and loads data.
        A failure in run_all() is logged and does not stop data loading.
        '''
        # See KUDU-1419: If we expect to be running Kudu in the minicluster inside the
        # Docker container, we have to protect against storage engines like AUFS and
        # their incompatibility with Kudu. First we have to get test data off the
        # container, store it somewhere, and then start another container using
        # docker -v and mount the test data as a volume to bypass AUFS. See also the
        # README for Leopard.
        if os.environ.get('KUDU_IS_SUPPORTED') == 'true':
            self._warm_testdata_volume()
            volume_map = {
                HOST_TESTDATA_EXTERNAL_VOLUME_PATH: DOCKER_TESTDATA_VOLUME_PATH,
            }
        else:
            volume_map = None

        self.start_new_container(volume_map=volume_map)
        LOG.info('Container Started')
        self.build_impala()
        # Bind result up front: it is logged below even when run_all() raises.
        result = None
        try:
            result = self.run_all()
        except Exception:
            LOG.exception('run_all exception')
        LOG.info('Run All Complete, Result: {0}'.format(result))
        self.load_data()
|