mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
Historically Impala used the Python2 version that was available on the hosting platform, as long as that version was at least v2.6. This caused constant headaches as all Python syntax had to be kept compatible with Python 2.6 (for CentOS 6). It also caused a recent problem on CentOS 8: here the system Python version was compiled with the system's GCC version (v8.3), which was much more recent than the Impala standard compiler version (GCC 4.9.2). When the Impala virtualenv was built, the system Python version supplied C compiler switches for modules containing native code that were unknown to the Impala version of GCC, thus breaking virtualenv installation. This patch changes the Impala virtualenv to always use the Python2 version from the toolchain, which is built with the toolchain compiler. This ensures that - Impala always has a known Python 2.7 version for all its scripts, - virtualenv modules based on native code will always be installable, as the Python environment and the modules are built with the same compiler version. Additional changes: - Add an auto-use fixture to conftest.py to check that the tests are being run with Python 2.7.x - Make bootstrap_toolchain.py independent from the Impala virtualenv: remove the dependency on the "sh" library Tests: - Passed core-mode tests on CentOS 7.4 - Passed core-mode tests in Docker-based mode for centos:7 and ubuntu:16.04 Most content in this patch was developed but not published earlier by Tim Armstrong. Change-Id: Ic7b40cef89cfb3b467b61b2d54a94e708642882b Reviewed-on: http://gerrit.cloudera.org:8080/15624 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
401 lines
16 KiB
Python
401 lines
16 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# This module will create a python virtual env and install external dependencies. If the
|
|
# virtualenv already exists and it contains all the expected packages, nothing is done.
|
|
#
|
|
# A multi-step bootstrapping process is required to build and install all of the
|
|
# dependencies:
|
|
# 1. install basic non-C/C++ packages into the virtualenv
|
|
# 1b. install packages that depend on step 1 but cannot be installed together with their
|
|
# dependencies
|
|
# 2. use the virtualenv Python to bootstrap the toolchain
|
|
# 3. use toolchain gcc to build C/C++ packages
|
|
# 4. build the kudu-python package with toolchain gcc and Cython
|
|
#
|
|
# Every time this script is run, it completes as many of the bootstrapping steps as
|
|
# possible with the available dependencies.
|
|
#
|
|
# This module can be run with python >= 2.4 but python >= 2.6 must be installed on the
|
|
# system. If the default 'python' command refers to < 2.6, python 2.6 will be used
|
|
# instead.
|
|
|
|
from __future__ import print_function
|
|
import glob
|
|
import logging
|
|
import optparse
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tarfile
|
|
import tempfile
|
|
import textwrap
|
|
import urllib
|
|
from bootstrap_toolchain import ToolchainPackage
|
|
|
|
# Directory containing this script; the paths below are all derived from it.
_SCRIPT_DIR = os.path.dirname(__file__)

# Logger named after this script's file name, without the extension.
LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])

# Directory with the requirements files and the archives pip installs from.
DEPS_DIR = os.path.join(_SCRIPT_DIR, "deps")

# Directory the virtualenv is created in.
ENV_DIR = os.path.join(_SCRIPT_DIR, "env")

# Requirements file with packages we need for our build and tests.
REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")

# Second stage of requirements which cannot be installed together with their dependencies
# in requirements.txt.
REQS2_PATH = os.path.join(DEPS_DIR, "stage2-requirements.txt")

# Requirements for the next bootstrapping step that builds compiled requirements
# with toolchain gcc.
COMPILED_REQS_PATH = os.path.join(DEPS_DIR, "compiled-requirements.txt")

# Requirements for the Kudu bootstrapping step, which depends on Cython being installed
# by the compiled requirements step.
KUDU_REQS_PATH = os.path.join(DEPS_DIR, "kudu-requirements.txt")

# Requirements for the ADLS test client step, which depends on Cffi (C Foreign Function
# Interface) being installed by the compiled requirements step.
ADLS_REQS_PATH = os.path.join(DEPS_DIR, "adls-requirements.txt")
|
|
|
|
def delete_virtualenv_if_exist():
  '''Removes the whole virtualenv directory tree (ENV_DIR). Does nothing if that path
  does not exist.'''
  if not os.path.exists(ENV_DIR):
    return
  shutil.rmtree(ENV_DIR)
|
|
|
|
|
|
def create_virtualenv():
  '''Creates the virtualenv at ENV_DIR using the Python from the Impala toolchain.

  The virtualenv*.tar.gz archive found in DEPS_DIR is unpacked into a temporary
  directory and its virtualenv.py is run with the interpreter returned by
  download_toolchain_python(); the same interpreter is passed as --python. The temporary
  directory is removed again, even if one of the steps fails.

  Raises an exception if the archive cannot be found or if a step fails.
  '''
  LOG.info("Creating python virtualenv")
  build_dir = tempfile.mkdtemp()
  try:
    archive = tarfile.open(find_file(DEPS_DIR, "virtualenv*.tar.gz"), "r:gz")
    try:
      for member in archive.getmembers():
        archive.extract(member, build_dir)
    finally:
      # Close the archive even if extraction fails part-way.
      archive.close()
    python_cmd = download_toolchain_python()
    exec_cmd([python_cmd, find_file(build_dir, "virtualenv*", "virtualenv.py"), "--quiet",
              "--python", python_cmd, ENV_DIR])
  finally:
    # Don't leave the unpacked sources behind in the temp dir on failure.
    shutil.rmtree(build_dir)
|
|
|
|
|
|
def exec_cmd(args, **kwargs):
  '''Runs a command, waits for it to finish and returns what it wrote to stdout and
  stderr (stderr is redirected into stdout).

  'args' and 'kwargs' use the same format as subprocess.Popen(). An exception that
  contains the command and its output is raised if the exit status is not zero.
  '''
  child = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                           **kwargs)
  combined_output, _ = child.communicate()
  if child.returncode == 0:
    return combined_output
  raise Exception("Command returned non-zero status\nCommand: %s\nOutput: %s"
                  % (args, combined_output))
|
|
|
|
def use_ccache():
  '''Returns true if ccache is available and should be used.

  ccache is not used if DISABLE_CCACHE is present in the environment (with any value) or
  if running "ccache -V" fails.'''
  if 'DISABLE_CCACHE' in os.environ:
    return False
  try:
    exec_cmd(['ccache', '-V'])
  except Exception:
    # Any failure to run ccache means it is not usable. Catching Exception rather than
    # everything lets KeyboardInterrupt and SystemExit propagate.
    return False
  return True
|
|
|
|
def select_cc():
  '''Returns the command line of the C compiler to use, as a string, or None if that
  compiler is not available. The command is prefixed with "ccache" when use_ccache()
  says so.'''
  # Use toolchain gcc for ABI compatibility with other toolchain packages, e.g.
  # Kudu/kudu-python
  if not have_toolchain():
    return None
  gcc_path = os.path.join(toolchain_pkg_dir("gcc"), "bin/gcc")
  if not os.path.exists(gcc_path):
    return None
  if use_ccache():
    return "ccache %s" % gcc_path
  return gcc_path
|
|
|
|
def exec_pip_install(args, cc="no-cc-available", env=None):
  '''Executes "pip install" with the provided command line arguments, then installs the
  package at $IMPALA_HOME/lib/python in editable mode ("pip install -e").

  If 'cc' is set, it is used as the C compiler. Otherwise compilation of C/C++ code is
  disabled by setting the CC environment variable to a bogus value.
  Other environment vars can optionally be set with the 'env' argument. By default the
  current process's environment variables are inherited. The dict passed as 'env' is
  copied, it is not modified.

  Raises an exception if $IMPALA_HOME is not set or if one of the pip commands fails.'''
  # Check this up front instead of failing with an obscure error after the (slow)
  # third-party install has already run.
  impala_home = os.getenv('IMPALA_HOME')
  if not impala_home:
    raise Exception(
        "Impala environment not set up correctly, make sure $IMPALA_HOME is set.")

  # Work on a copy so that the settings below don't leak into the caller's dict.
  env = dict(env) if env else dict(os.environ)
  env["CC"] = cc

  # Parallelize the slow numpy build.
  # Use getconf instead of nproc because it is supported more widely, e.g. on older
  # linux distributions.
  env["NPY_NUM_BUILD_JOBS"] = exec_cmd(["getconf", "_NPROCESSORS_ONLN"]).strip()

  # Don't call the virtualenv pip directly, it uses a hashbang to call the python
  # virtualenv using an absolute path. If the path to the virtualenv is very long, the
  # hashbang won't work.
  impala_pip_base_cmd = [os.path.join(ENV_DIR, "bin", "python"),
                         os.path.join(ENV_DIR, "bin", "pip"), "install", "-v"]

  # Passes --no-binary for IMPALA-3767: without this, Cython (and
  # several other packages) fail download.
  #
  # --no-cache-dir is used to prevent caching of compiled artifacts, which may be built
  # with different compilers or settings.
  third_party_pkg_install_cmd = \
      impala_pip_base_cmd + ["--no-binary", ":all:", "--no-cache-dir"]

  # When using a custom mirror, we also must use the index of that mirror.
  if "PYPI_MIRROR" in os.environ:
    third_party_pkg_install_cmd.extend(["--index-url",
                                        "%s/simple" % os.environ["PYPI_MIRROR"]])
  else:
    # Prevent fetching additional packages from the index. If we forget to add a package
    # to one of the requirements.txt files, this should trigger an error. However, we will
    # still access the index for version/dependency resolution, hence we need to change it
    # when using a private mirror.
    third_party_pkg_install_cmd.append("--no-index")

  third_party_pkg_install_cmd.extend(["--find-links",
      "file://%s" % urllib.pathname2url(os.path.abspath(DEPS_DIR))])
  third_party_pkg_install_cmd.extend(args)
  exec_cmd(third_party_pkg_install_cmd, env=env)

  # Finally, we want to install the packages from our own internal python lib. No 'env'
  # is passed here, so this command runs with the current process's environment.
  local_package_install_cmd = impala_pip_base_cmd + \
      ['-e', os.path.join(impala_home, 'lib', 'python')]
  exec_cmd(local_package_install_cmd)
|
|
|
|
|
|
def find_file(*paths):
  '''Joins 'paths' into one glob pattern and returns the single path that matches it.
  An exception is raised if nothing matches or if more than one path matches.

  Ex: find_file('/etc', 'h*sts') --> /etc/hosts
  '''
  pattern = os.path.join(*paths)
  matches = glob.glob(pattern)
  if not matches:
    raise Exception("No file found at %s" % pattern)
  if len(matches) > 1:
    raise Exception("Found too many files at %s: %s" % (pattern, matches))
  return matches[0]
|
|
|
|
|
|
def download_toolchain_python():
  '''Fetches the "python" package of the Impala toolchain through ToolchainPackage from
  bootstrap_toolchain.py and returns the path of its bin/python.

  Raises an exception if $IMPALA_TOOLCHAIN is unset or empty, or if bin/python does not
  exist after the download.
  '''
  if not os.environ.get("IMPALA_TOOLCHAIN"):
    raise Exception(
        "Impala environment not set up correctly, make sure $IMPALA_TOOLCHAIN is set.")

  python_pkg = ToolchainPackage("python")
  python_pkg.download()
  interpreter = os.path.join(python_pkg.pkg_directory(), "bin/python")
  if os.path.exists(interpreter):
    return interpreter
  raise Exception("Unexpected error bootstrapping python from toolchain: {0} does not "
                  "exist".format(interpreter))
|
|
|
|
|
|
def install_deps():
  '''Installs the requirements from REQS_PATH and then those from REQS2_PATH into the
  virtualenv. Each file is marked as installed right after its packages went in.'''
  stages = ((REQS_PATH, "Installing packages into the virtualenv"),
            (REQS2_PATH, "Installing stage 2 packages into the virtualenv"))
  for stage_reqs_path, announcement in stages:
    LOG.info(announcement)
    exec_pip_install(["-r", stage_reqs_path])
    mark_reqs_installed(stage_reqs_path)
|
|
|
|
def have_toolchain():
  '''Tells whether the Impala toolchain is available, i.e. whether IMPALA_TOOLCHAIN is
  present in the environment (its value is not looked at).'''
  return os.environ.get("IMPALA_TOOLCHAIN") is not None
|
|
|
|
def toolchain_pkg_dir(pkg_name):
  '''Builds the path $IMPALA_TOOLCHAIN/<pkg_name>-<version>, where the version is read
  from the IMPALA_<PKG_NAME>_VERSION environment variable. A KeyError is raised if one
  of the two environment variables is not set.'''
  version_var = "IMPALA_%s_VERSION" % pkg_name.upper()
  dir_name = "%s-%s" % (pkg_name, os.environ[version_var])
  return os.path.join(os.environ["IMPALA_TOOLCHAIN"], dir_name)
|
|
|
|
def install_compiled_deps_if_possible():
  '''Install dependencies that require compilation with toolchain GCC, if the toolchain
  is available. Returns true if the deps are installed (either already or by this
  call) and false if no C compiler is available yet.'''
  if reqs_are_installed(COMPILED_REQS_PATH):
    LOG.debug("Skipping compiled deps: matching compiled-installed-requirements.txt found")
    return True
  cc = select_cc()
  if cc is None:
    LOG.debug("Skipping compiled deps: cc not available yet")
    return False

  env = dict(os.environ)

  # Compilation of pycrypto fails on CentOS 5 with newer GCC versions because of a
  # problem with inline declarations in older libc headers. Setting -fgnu89-inline is a
  # workaround.
  try:
    distro_version = ''.join(exec_cmd(["lsb_release", "-irs"]).lower().split())
  except Exception:
    # The distro is only needed for the workaround above, so a missing or failing
    # lsb_release should not abort the installation.
    LOG.debug("Could not determine the distro version with lsb_release", exc_info=True)
    distro_version = ''
  LOG.debug("Detected distro version: %s", distro_version)
  if distro_version.startswith("centos5."):
    env["CFLAGS"] = "-fgnu89-inline"

  LOG.info("Installing compiled requirements into the virtualenv")
  exec_pip_install(["-r", COMPILED_REQS_PATH], cc=cc, env=env)
  mark_reqs_installed(COMPILED_REQS_PATH)
  return True
|
|
|
|
def install_adls_deps():
  '''Installs the requirements from ADLS_REQS_PATH if TARGET_FILESYSTEM is "adls".

  Returns true if the ADLS deps are installed (either already or by this call) and
  false if the target filesystem is not ADLS. Raises an exception if no C compiler is
  available.'''
  # The ADLS dependencies require that the OS is at least CentOS 6.7 or above,
  # which is why we break this into a separate step. If the target filesystem is
  # ADLS, the expectation is that the dev environment is running at least CentOS 6.7.
  if os.environ.get('TARGET_FILESYSTEM') != "adls":
    return False
  if reqs_are_installed(ADLS_REQS_PATH):
    LOG.debug("Skipping ADLS deps: matching adls-installed-requirements.txt found")
    return True
  cc = select_cc()
  if cc is None:
    # An explicit error instead of an assert: asserts are stripped under "python -O".
    raise Exception("Cannot install ADLS packages: no C compiler is available")
  LOG.info("Installing ADLS packages into the virtualenv")
  exec_pip_install(["-r", ADLS_REQS_PATH], cc=cc)
  mark_reqs_installed(ADLS_REQS_PATH)
  return True
|
|
|
|
def install_kudu_client_if_possible():
  '''Installs the Kudu python module if possible, which depends on the toolchain and
  the compiled requirements in compiled-requirements.txt. Nothing will be done if the
  Kudu requirements are already installed, if the Kudu client lib required by the
  module isn't available (as determined by KUDU_IS_SUPPORTED) or if the directory
  named by IMPALA_KUDU_HOME doesn't exist.

  Raises an exception if no C compiler is available or if the installation fails.'''
  if reqs_are_installed(KUDU_REQS_PATH):
    LOG.debug("Skipping Kudu: matching kudu-installed-requirements.txt found")
    return
  if os.environ["KUDU_IS_SUPPORTED"] != "true":
    LOG.debug("Skipping Kudu: Kudu is not supported")
    return
  kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
  if not os.path.exists(kudu_base_dir):
    LOG.debug("Skipping Kudu: %s doesn't exist", kudu_base_dir)
    return

  LOG.info("Installing Kudu into the virtualenv")
  # The installation requires that KUDU_HOME/build/latest exists. An empty directory
  # structure will be made to satisfy that. The Kudu client headers and lib will be made
  # available through GCC environment variables.
  # A unique directory is used so that concurrent runs on the same machine don't share
  # (and delete) each other's directory.
  fake_kudu_build_dir = tempfile.mkdtemp(prefix="virtualenv-kudu")
  try:
    os.makedirs(os.path.join(fake_kudu_build_dir, "build", "latest"))
    cc = select_cc()
    if cc is None:
      # An explicit error instead of an assert: asserts are stripped under "python -O".
      raise Exception("Cannot install Kudu: no C compiler is available")
    env = dict(os.environ)
    env["KUDU_HOME"] = fake_kudu_build_dir
    kudu_client_dir = find_kudu_client_install_dir()
    env["CPLUS_INCLUDE_PATH"] = os.path.join(kudu_client_dir, "include")
    env["LIBRARY_PATH"] = os.path.pathsep.join([os.path.join(kudu_client_dir, 'lib'),
                                                os.path.join(kudu_client_dir, 'lib64')])
    exec_pip_install(["-r", KUDU_REQS_PATH], cc=cc, env=env)
    mark_reqs_installed(KUDU_REQS_PATH)
  finally:
    # Best-effort cleanup: a failure to remove the temp dir must not hide the outcome of
    # the installation.
    try:
      shutil.rmtree(fake_kudu_build_dir)
    except Exception:
      LOG.debug("Error removing temp Kudu build dir", exc_info=True)
|
|
|
|
|
|
def find_kudu_client_install_dir():
  '''Returns the directory that is expected to contain the Kudu client's "include" and
  "lib"/"lib64" directories.

  If KUDU_CLIENT_DIR is set to a non-empty value, the result is
  $KUDU_CLIENT_DIR/usr/local and the client must exist there. Otherwise the result is
  $IMPALA_KUDU_HOME/debug; the client is only required to exist there if
  $IMPALA_KUDU_HOME itself exists.

  Raises an exception if a required client is not found and a KeyError if
  IMPALA_KUDU_HOME is needed but not set.'''
  # KUDU_CLIENT_DIR is optional: treat an unset variable like an empty one.
  custom_client_dir = os.environ.get("KUDU_CLIENT_DIR")
  if custom_client_dir:
    install_dir = os.path.join(custom_client_dir, "usr", "local")
    error_if_kudu_client_not_found(install_dir)
  else:
    # If the toolchain appears to have been setup already, then the Kudu client is
    # required to exist. It's possible that the toolchain won't be setup yet though
    # since the toolchain bootstrap script depends on the virtualenv.
    kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
    install_dir = os.path.join(kudu_base_dir, "debug")
    if os.path.exists(kudu_base_dir):
      error_if_kudu_client_not_found(install_dir)
  return install_dir
|
|
|
|
|
|
def error_if_kudu_client_not_found(install_dir):
  '''Raises an exception unless a Kudu client is found under 'install_dir'.

  The header include/kudu/client/client.h must exist, and a file named
  libkudu_client.so must exist somewhere below the "lib64" directory, or below "lib" if
  there is no "lib64". Returns None if both are found.'''
  header_path = os.path.join(install_dir, "include", "kudu", "client", "client.h")
  if not os.path.exists(header_path):
    raise Exception("Kudu client header not found at %s" % header_path)

  kudu_client_lib = "libkudu_client.so"
  lib_dir = os.path.join(install_dir, "lib64")
  if not os.path.exists(lib_dir):
    lib_dir = os.path.join(install_dir, "lib")
  # The library may be in a subdirectory, so the whole tree is searched.
  if any(kudu_client_lib in filenames for _, _, filenames in os.walk(lib_dir)):
    return
  raise Exception("%s not found at %s" % (kudu_client_lib, lib_dir))
|
|
|
|
def mark_reqs_installed(reqs_path):
  '''Records that the requirements from the given file are installed. This is done by
  placing a copy of the file, under the same file name, in the root directory of the
  virtualenv.'''
  marker_name = os.path.basename(reqs_path)
  shutil.copyfile(reqs_path, os.path.join(ENV_DIR, marker_name))
|
|
|
|
def reqs_are_installed(reqs_path):
  '''Tells whether the requirements from the given file are installed in the virtualenv.
  They count as installed if the root directory of the virtualenv holds a file with the
  same name and the same content.'''
  marker_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
  if not os.path.exists(marker_path):
    return False
  with open(marker_path) as marker_file:
    with open(reqs_path) as wanted_file:
      up_to_date = wanted_file.read() == marker_file.read()
  if not up_to_date:
    LOG.debug("Virtualenv upgrade needed")
  return up_to_date
|
|
|
|
def setup_virtualenv_if_not_exists():
  '''Rebuilds the virtualenv from scratch unless both the REQS_PATH and the REQS2_PATH
  requirements are already installed in it: any existing virtualenv is deleted, a new
  one is created and the two requirement stages are installed.'''
  if reqs_are_installed(REQS_PATH) and reqs_are_installed(REQS2_PATH):
    return
  delete_virtualenv_if_exist()
  create_virtualenv()
  install_deps()
  LOG.debug("Virtualenv setup complete")
|
|
|
|
|
|
if __name__ == "__main__":
  # Command line handling.
  parser = optparse.OptionParser()
  parser.add_option("-l", "--log-level", default="INFO",
                    choices=("DEBUG", "INFO", "WARN", "ERROR"))
  parser.add_option(
      "-r", "--rebuild", action="store_true",
      help="Force a rebuild of the virtualenv even if it exists and appears to be "
           "completely up-to-date.")
  parser.add_option(
      "--print-ld-library-path", action="store_true",
      help="Print the LD_LIBRARY_PATH that should be used when running python from the "
           "virtualenv.")
  options, args = parser.parse_args()

  # Query mode: only print the Kudu client lib dirs, then exit without touching the
  # virtualenv.
  if options.print_ld_library_path:
    kudu_client_dir = find_kudu_client_install_dir()
    lib_dirs = [os.path.join(kudu_client_dir, subdir) for subdir in ('lib', 'lib64')]
    print(os.path.pathsep.join(lib_dirs))
    sys.exit()

  logging.basicConfig(level=getattr(logging, options.log_level))
  if options.rebuild:
    delete_virtualenv_if_exist()

  # Complete as many bootstrap steps as possible (see file comment for the steps).
  setup_virtualenv_if_not_exists()
  # The Kudu and ADLS steps are only attempted once the compiled requirements are in.
  if install_compiled_deps_if_possible():
    install_kudu_client_if_possible()
    install_adls_deps()
|