mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
The key facts here are: * --no-cache-dir is crucial because it prevents us pulling in a cached package compiled with the wrong compiler. * --no-binary takes an argument specifying the set of packages it should apply to. The latent bug was that we didn't provide an argument to --no-binary and instead it took --no-index as the argument, which was a no-op because there are no packages of that name. IMPALA-6731 moved the arguments, and instead --no-cache-dir became the argument to --no-binary. Testing: I could reliably reproduce the failure in my environment by deleting infra/python/env then running a test with impala-py.test. This patch is sufficient to solve it. Change-Id: I118738347ca537b2dddfa6142c3eb5608c49c2e0 Reviewed-on: http://gerrit.cloudera.org:8080/9829 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Impala Public Jenkins
393 lines
16 KiB
Python
393 lines
16 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# This module will create a python virtual env and install external dependencies. If the
|
|
# virtualenv already exists and it contains all the expected packages, nothing is done.
|
|
#
|
|
# A multi-step bootstrapping process is required to build and install all of the
|
|
# dependencies:
|
|
# 1. install basic non-C/C++ packages into the virtualenv
|
|
# 1b. install packages that depend on step 1 but cannot be installed together with their
|
|
# dependencies
|
|
# 2. use the virtualenv Python to bootstrap the toolchain
|
|
# 3. use toolchain gcc to build C/C++ packages
|
|
# 4. build the kudu-python package with toolchain gcc and Cython
|
|
#
|
|
# Every time this script is run, it completes as many of the bootstrapping steps as
|
|
# possible with the available dependencies.
|
|
#
|
|
# This module can be run with python >= 2.4 but python >= 2.6 must be installed on the
|
|
# system. If the default 'python' command refers to < 2.6, another python 2.7 or 2.6
# command found on the PATH will be used instead.
|
|
|
|
import glob
|
|
import logging
|
|
import optparse
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tarfile
|
|
import tempfile
|
|
import textwrap
|
|
import urllib
|
|
|
|
# Module-level logger, named after this script's file name without its extension.
LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])

# Directory next to this script that holds the requirements files and the bundled
# virtualenv tarball. It is also passed to pip as the --find-links location.
DEPS_DIR = os.path.join(os.path.dirname(__file__), "deps")
# Directory next to this script in which the virtualenv is created.
ENV_DIR = os.path.join(os.path.dirname(__file__), "env")

# Requirements file with packages we need for our build and tests.
REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")

# Second stage of requirements which cannot be installed together with their dependencies
# in requirements.txt.
REQS2_PATH = os.path.join(DEPS_DIR, "stage2-requirements.txt")

# Requirements for the next bootstrapping step that builds compiled requirements
# with toolchain gcc.
COMPILED_REQS_PATH = os.path.join(DEPS_DIR, "compiled-requirements.txt")

# Requirements for the Kudu bootstrapping step, which depends on Cython being installed
# by the compiled requirements step.
KUDU_REQS_PATH = os.path.join(DEPS_DIR, "kudu-requirements.txt")

# Requirements for the ADLS test client step, which depends on Cffi (C Foreign Function
# Interface) being installed by the compiled requirements step.
ADLS_REQS_PATH = os.path.join(DEPS_DIR, "adls-requirements.txt")
|
|
|
|
def delete_virtualenv_if_exist():
    '''Removes the whole virtualenv directory tree (ENV_DIR), if there is one.'''
    if not os.path.exists(ENV_DIR):
        return
    shutil.rmtree(ENV_DIR)
|
|
|
|
|
|
def create_virtualenv():
    '''Creates the virtualenv in ENV_DIR using the virtualenv tarball bundled in DEPS_DIR.

    The tarball is unpacked into a temporary build directory and its virtualenv.py is run
    with the python command returned by detect_python_cmd(). The temporary directory is
    removed again even if unpacking or the virtualenv command fails.

    Raises an Exception if the tarball or virtualenv.py cannot be found unambiguously
    (see find_file), if no suitable python is found, or if the virtualenv command exits
    with a non-zero status (see exec_cmd).
    '''
    LOG.info("Creating python virtualenv")
    build_dir = tempfile.mkdtemp()
    try:
        archive = tarfile.open(find_file(DEPS_DIR, "virtualenv*.tar.gz"), "r:gz")
        try:
            # Members are extracted one by one; presumably because
            # TarFile.extractall() is not available on python 2.4.
            for member in archive.getmembers():
                archive.extract(member, build_dir)
        finally:
            archive.close()
        python_cmd = detect_python_cmd()
        exec_cmd([python_cmd, find_file(build_dir, "virtualenv*", "virtualenv.py"),
                  "--quiet", "--python", python_cmd, ENV_DIR])
    finally:
        # Always clean up the unpacked sources, also on failure.
        shutil.rmtree(build_dir)
|
|
|
|
|
|
def exec_cmd(args, **kwargs):
    '''Runs a command to completion and returns everything it wrote to stdout and stderr
    (stderr is merged into stdout).

    'args' and 'kwargs' use the same format as subprocess.Popen().

    Raises an Exception containing the command and its output if the exit status is not
    zero.
    '''
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                            **kwargs)
    combined_output, _ = proc.communicate()
    if proc.returncode == 0:
        return combined_output
    raise Exception("Command returned non-zero status\nCommand: %s\nOutput: %s"
                    % (args, combined_output))
|
|
|
|
def use_ccache():
    '''Returns true if ccache is available and should be used.

    ccache is never used if the DISABLE_CCACHE environment variable is set (to any
    value). Otherwise availability is probed by running "ccache -V"; any failure to run
    it is treated as "not available".
    '''
    if 'DISABLE_CCACHE' in os.environ:
        return False
    try:
        exec_cmd(['ccache', '-V'])
    except Exception:
        # Deliberately broad (missing binary, non-zero exit, ...), but unlike a bare
        # "except:" this does not swallow KeyboardInterrupt/SystemExit on python >= 2.5.
        return False
    return True
|
|
|
|
def select_cc():
    '''Returns the C compiler command line to use, as a string, or None if the toolchain
    compiler is not available (yet). The command is prefixed with "ccache" if
    use_ccache() says so.'''
    # Only toolchain gcc is considered, for ABI compatibility with other toolchain
    # packages, e.g. Kudu/kudu-python.
    if not have_toolchain():
        return None
    gcc_path = os.path.join(toolchain_pkg_dir("gcc"), "bin/gcc")
    if not os.path.exists(gcc_path):
        return None
    if use_ccache():
        return "ccache %s" % gcc_path
    return gcc_path
|
|
|
|
def exec_pip_install(args, cc="no-cc-available", env=None):
    '''Executes "pip install" in the virtualenv with the provided command line arguments.

    args: list of extra arguments appended to the "pip install" command line.
    cc: value for the CC environment variable, i.e. the C compiler to use. The default is
        a bogus value, which disables compilation of C/C++ code.
    env: optional dict of environment variables for the pip process. If it is not
        provided (or empty), the current process's environment variables are inherited.
        The dict passed in is copied and never modified.

    Raises an Exception (from exec_cmd) if pip exits with a non-zero status.
    '''
    # Work on a copy so that neither os.environ nor the caller's dict is modified by the
    # assignments below.
    if env:
        env = dict(env)
    else:
        env = dict(os.environ)
    env["CC"] = cc

    # Parallelize the slow numpy build.
    # Use getconf instead of nproc because it is supported more widely, e.g. on older
    # linux distributions.
    env["NPY_NUM_BUILD_JOBS"] = exec_cmd(["getconf", "_NPROCESSORS_ONLN"]).strip()

    # Don't call the virtualenv pip directly, it uses a hashbang to call the python
    # virtualenv using an absolute path. If the path to the virtualenv is very long, the
    # hashbang won't work.
    #
    # Passes --no-binary for IMPALA-3767: without this, Cython (and
    # several other packages) fail download. --no-binary takes an argument that selects
    # the packages it applies to; ":all:" is passed here.
    #
    # --no-cache-dir is used to prevent caching of compiled artifacts, which may be built
    # with different compilers or settings.
    cmd = [os.path.join(ENV_DIR, "bin", "python"), os.path.join(ENV_DIR, "bin", "pip"),
           "install", "-v", "--no-binary", ":all:", "--no-cache-dir"]

    # When using a custom mirror, we also must use the index of that mirror.
    if "PYPI_MIRROR" in os.environ:
        cmd.extend(["--index-url", "%s/simple" % os.environ["PYPI_MIRROR"]])
    else:
        # Prevent fetching additional packages from the index. If we forget to add a
        # package to one of the requirements.txt files, this should trigger an error.
        # However, we will still access the index for version/dependency resolution,
        # hence we need to change it when using a private mirror.
        cmd.append("--no-index")

    # Packages are looked up in the local deps directory.
    cmd.extend(["--find-links",
                "file://%s" % urllib.pathname2url(os.path.abspath(DEPS_DIR))])
    cmd.extend(args)
    exec_cmd(cmd, env=env)
|
|
|
|
|
|
def find_file(*paths):
    '''Joins 'paths' into a single glob pattern and returns the one path matching it.
    An exception is raised if nothing matches or if more than one path matches.

    Ex: find_file('/etc', 'h*sts') --> /etc/hosts
    '''
    pattern = os.path.join(*paths)
    matches = glob.glob(pattern)
    if not matches:
        raise Exception("No file found at %s" % pattern)
    if len(matches) > 1:
        raise Exception("Found too many files at %s: %s" % (pattern, matches))
    return matches[0]
|
|
|
|
|
|
def detect_python_cmd():
    '''Returns the path of a system command that provides python 2.6 or greater.

    Each candidate command name is looked up in every directory of PATH (os.defpath is
    used if PATH is not set). A candidate is accepted if it is an executable regular file
    and a small version check script run with it exits with status 0.

    Raises an Exception if no suitable command is found.
    '''
    # Exits with 1 if the interpreter is older than 2.6, otherwise with 0.
    version_check = "import sys; sys.exit(int(sys.version_info[:2] < (2, 6)))"
    search_dirs = os.environ.get("PATH", os.defpath).split(os.path.pathsep)
    for cmd in ("python", "python27", "python2.7", "python-27", "python-2.7", "python26",
                "python2.6", "python-26", "python-2.6"):
        for search_dir in search_dirs:
            cmd_path = os.path.join(search_dir, cmd)
            # isfile() rather than exists(): a directory of that name also passes the
            # X_OK check but cannot be executed.
            if not os.path.isfile(cmd_path) or not os.access(cmd_path, os.X_OK):
                continue
            try:
                status = subprocess.call([cmd_path, "-c", version_check])
            except OSError:
                # Could not be executed after all; try the next candidate.
                continue
            if status == 0:
                return cmd_path
    raise Exception("Could not find minimum required python version 2.6")
|
|
|
|
|
|
def install_deps():
    '''Installs the two stages of non-compiled requirements into the virtualenv, one
    after the other, and marks each stage as installed once pip has succeeded.'''
    stages = (
        ("Installing packages into the virtualenv", REQS_PATH),
        ("Installing stage 2 packages into the virtualenv", REQS2_PATH),
    )
    for message, reqs_path in stages:
        LOG.info(message)
        exec_pip_install(["-r", reqs_path])
        mark_reqs_installed(reqs_path)
|
|
|
|
def have_toolchain():
    '''Returns true if the IMPALA_TOOLCHAIN environment variable is set (to any value,
    including the empty string), i.e. if the Impala toolchain is available.'''
    return os.environ.get("IMPALA_TOOLCHAIN") is not None
|
|
|
|
def toolchain_pkg_dir(pkg_name):
    '''Returns the path "<IMPALA_TOOLCHAIN>/<pkg_name>-<version>" of a toolchain package.
    The version is read from the IMPALA_<PKG_NAME>_VERSION environment variable. A
    KeyError is raised if either environment variable is not set.'''
    version_var = "IMPALA_%s_VERSION" % pkg_name.upper()
    pkg_version = os.environ[version_var]
    toolchain_root = os.environ["IMPALA_TOOLCHAIN"]
    return os.path.join(toolchain_root, "%s-%s" % (pkg_name, pkg_version))
|
|
|
|
def install_compiled_deps_if_possible():
    '''Install dependencies that require compilation with toolchain GCC, if the toolchain
    is available.

    Returns true if the deps are installed (either already or by this call) and false if
    the installation was skipped because no toolchain compiler is available yet.
    '''
    if reqs_are_installed(COMPILED_REQS_PATH):
        LOG.debug(
            "Skipping compiled deps: matching compiled-installed-requirements.txt found")
        return True
    cc = select_cc()
    if cc is None:
        LOG.debug("Skipping compiled deps: cc not available yet")
        return False

    env = dict(os.environ)

    # Compilation of pycrypto fails on CentOS 5 with newer GCC versions because of a
    # problem with inline declarations in older libc headers. Setting -fgnu89-inline is a
    # workaround.
    # The lsb_release output is lower-cased and stripped of all whitespace before the
    # prefix check below.
    distro_version = ''.join(exec_cmd(["lsb_release", "-irs"]).lower().split())
    # Logged instead of printed so that stdout is not polluted with debug output.
    LOG.debug("Detected distro version: %s", distro_version)
    if distro_version.startswith("centos5."):
        env["CFLAGS"] = "-fgnu89-inline"

    LOG.info("Installing compiled requirements into the virtualenv")
    exec_pip_install(["-r", COMPILED_REQS_PATH], cc=cc, env=env)
    mark_reqs_installed(COMPILED_REQS_PATH)
    return True
|
|
|
|
def install_adls_deps():
    '''Installs the ADLS test client requirements, but only if the TARGET_FILESYSTEM
    environment variable is "adls".

    Returns True if the requirements were already installed; otherwise nothing is
    returned. Raises an Exception if the packages need to be installed but the toolchain
    compiler is not available.
    '''
    # The ADLS dependencies require that the OS is at least CentOS 6.7 or above,
    # which is why we break this into a separate step. If the target filesystem is
    # ADLS, the expectation is that the dev environment is running at least CentOS 6.7.
    if os.environ.get('TARGET_FILESYSTEM') != "adls":
        return
    if reqs_are_installed(ADLS_REQS_PATH):
        LOG.debug("Skipping ADLS deps: matching adls-installed-requirements.txt found")
        return True
    cc = select_cc()
    if cc is None:
        # An explicit check rather than an assert, which would be stripped under -O.
        raise Exception("Cannot install ADLS packages: toolchain compiler not available")
    LOG.info("Installing ADLS packages into the virtualenv")
    exec_pip_install(["-r", ADLS_REQS_PATH], cc=cc)
    mark_reqs_installed(ADLS_REQS_PATH)
|
|
|
|
def install_kudu_client_if_possible():
    '''Installs the Kudu python module if possible, which depends on the toolchain and
    the compiled requirements in compiled-requirements.txt. If the toolchain isn't
    available, nothing will be done. Also nothing will be done if the Kudu client lib
    required by the module isn't available (as determined by KUDU_IS_SUPPORTED; an unset
    variable is treated as "not supported").

    Raises an Exception if the installation is attempted but the toolchain compiler or
    the Kudu client cannot be found, or if pip fails.
    '''
    if reqs_are_installed(KUDU_REQS_PATH):
        LOG.debug("Skipping Kudu: matching kudu-installed-requirements.txt found")
        return
    if os.environ.get("KUDU_IS_SUPPORTED") != "true":
        LOG.debug("Skipping Kudu: Kudu is not supported")
        return
    if not have_toolchain():
        LOG.debug("Skipping Kudu: IMPALA_TOOLCHAIN not set")
        return
    toolchain_kudu_dir = toolchain_pkg_dir("kudu")
    if not os.path.exists(toolchain_kudu_dir):
        LOG.debug("Skipping Kudu: %s doesn't exist", toolchain_kudu_dir)
        return

    LOG.info("Installing Kudu into the virtualenv")
    # The installation requires that KUDU_HOME/build/latest exists. An empty directory
    # structure will be made to satisfy that. The Kudu client headers and lib will be made
    # available through GCC environment variables.
    # A unique directory is used so that concurrent runs cannot interfere with (or
    # delete) each other's fake build dir.
    fake_kudu_build_dir = tempfile.mkdtemp(prefix="virtualenv-kudu-")
    try:
        os.makedirs(os.path.join(fake_kudu_build_dir, "build", "latest"))
        cc = select_cc()
        if cc is None:
            # An explicit check rather than an assert, which would be stripped under -O.
            raise Exception("Cannot install Kudu: toolchain compiler not available")
        env = dict(os.environ)
        env["KUDU_HOME"] = fake_kudu_build_dir
        kudu_client_dir = find_kudu_client_install_dir()
        env["CPLUS_INCLUDE_PATH"] = os.path.join(kudu_client_dir, "include")
        env["LIBRARY_PATH"] = os.path.pathsep.join([
            os.path.join(kudu_client_dir, 'lib'),
            os.path.join(kudu_client_dir, 'lib64')])
        exec_pip_install(["-r", KUDU_REQS_PATH], cc=cc, env=env)
        mark_reqs_installed(KUDU_REQS_PATH)
    finally:
        # Best-effort cleanup; a failure to remove the temp dir must not hide the outcome
        # of the installation itself.
        try:
            shutil.rmtree(fake_kudu_build_dir)
        except Exception:
            LOG.debug("Error removing temp Kudu build dir", exc_info=True)
|
|
|
|
|
|
def find_kudu_client_install_dir():
    '''Returns the directory that is expected to contain the Kudu client's "include" and
    "lib"/"lib64" directories.

    If the KUDU_CLIENT_DIR environment variable is set to a non-empty value, its
    "usr/local" subdirectory is returned and the client must exist there. Otherwise the
    "debug" directory of the toolchain's Kudu package is returned; in that case the
    client is only required to exist if the package directory itself exists.

    Raises an Exception if the client is required but not found, and a KeyError if
    IMPALA_TOOLCHAIN or IMPALA_KUDU_VERSION is needed but not set.
    '''
    # An unset KUDU_CLIENT_DIR is handled like an empty one.
    custom_client_dir = os.environ.get("KUDU_CLIENT_DIR")
    if custom_client_dir:
        install_dir = os.path.join(custom_client_dir, "usr", "local")
        error_if_kudu_client_not_found(install_dir)
    else:
        # If the toolchain appears to have been setup already, then the Kudu client is
        # required to exist. It's possible that the toolchain won't be setup yet though
        # since the toolchain bootstrap script depends on the virtualenv.
        kudu_base_dir = os.path.join(os.environ["IMPALA_TOOLCHAIN"],
                                     "kudu-%s" % os.environ["IMPALA_KUDU_VERSION"])
        install_dir = os.path.join(kudu_base_dir, "debug")
        if os.path.exists(kudu_base_dir):
            error_if_kudu_client_not_found(install_dir)
    return install_dir
|
|
|
|
|
|
def error_if_kudu_client_not_found(install_dir):
    '''Raises an Exception unless 'install_dir' contains both the Kudu client header
    (include/kudu/client/client.h) and, anywhere below its "lib64" directory (or below
    "lib" if "lib64" does not exist), a file named libkudu_client.so.'''
    header_path = os.path.join(install_dir, "include", "kudu", "client", "client.h")
    if not os.path.exists(header_path):
        raise Exception("Kudu client header not found at %s" % header_path)

    kudu_client_lib = "libkudu_client.so"
    lib_dir = os.path.join(install_dir, "lib64")
    if not os.path.exists(lib_dir):
        lib_dir = os.path.join(install_dir, "lib")
    # The library may sit in any subdirectory of the lib dir.
    for _, _, file_names in os.walk(lib_dir):
        if kudu_client_lib in file_names:
            return
    raise Exception("%s not found at %s" % (kudu_client_lib, lib_dir))
|
|
|
|
def mark_reqs_installed(reqs_path):
    '''Records that the requirements in 'reqs_path' are installed: the file is copied,
    under the same file name, into the root directory of the virtualenv.'''
    marker_name = os.path.basename(reqs_path)
    shutil.copyfile(reqs_path, os.path.join(ENV_DIR, marker_name))
|
|
|
|
def reqs_are_installed(reqs_path):
    '''Returns True if the root directory of the virtualenv holds a copy of the given
    requirements file with exactly the same contents, i.e. if those requirements are
    already installed. Returns False if there is no copy or if it differs.'''
    installed_reqs_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
    if not os.path.exists(installed_reqs_path):
        return False
    # Nested try/finally instead of "with", which python 2.4 does not support.
    installed_file = open(installed_reqs_path)
    try:
        wanted_file = open(reqs_path)
        try:
            up_to_date = wanted_file.read() == installed_file.read()
        finally:
            wanted_file.close()
    finally:
        installed_file.close()
    if not up_to_date:
        LOG.debug("Virtualenv upgrade needed")
    return up_to_date
|
|
|
|
def setup_virtualenv_if_not_exists():
    '''Rebuilds the virtualenv from scratch (delete, create, install the non-compiled
    requirements) unless both requirements stages are already marked as installed.'''
    if not reqs_are_installed(REQS_PATH) or not reqs_are_installed(REQS2_PATH):
        delete_virtualenv_if_exist()
        create_virtualenv()
        install_deps()
        LOG.debug("Virtualenv setup complete")
|
|
|
|
|
|
if __name__ == "__main__":
    parser = optparse.OptionParser()
    parser.add_option("-l", "--log-level", default="INFO",
                      choices=("DEBUG", "INFO", "WARN", "ERROR"))
    parser.add_option("-r", "--rebuild", action="store_true",
                      help="Force a rebuild of the virtualenv even if it exists and"
                           " appears to be completely up-to-date.")
    parser.add_option("--print-ld-library-path", action="store_true",
                      help="Print the LD_LIBRARY_PATH that should be used when running"
                           " python from the virtualenv.")
    options, args = parser.parse_args()

    # Query mode: only print the library path of the Kudu client, then exit without
    # touching the virtualenv.
    if options.print_ld_library_path:
        kudu_client_dir = find_kudu_client_install_dir()
        lib_dirs = [os.path.join(kudu_client_dir, 'lib'),
                    os.path.join(kudu_client_dir, 'lib64')]
        sys.stdout.write(os.path.pathsep.join(lib_dirs) + "\n")
        sys.exit()

    logging.basicConfig(level=getattr(logging, options.log_level))
    if options.rebuild:
        delete_virtualenv_if_exist()

    # Complete as many bootstrap steps as possible (see file comment for the steps).
    # The Kudu and ADLS steps are only attempted once the compiled requirements are
    # installed.
    setup_virtualenv_if_not_exists()
    if install_compiled_deps_if_possible():
        install_kudu_client_if_possible()
        install_adls_deps()
|