mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
This upgrades GCC and libstdc++ to version 7.5.0. There
have been ABI changes since 4.9.2, so this means that
the native-toolchain produced with the new compiler is
not interoperable with one produced by the old compiler.
To allow that transition, IMPALA_TOOLCHAIN_PACKAGES_HOME
is now a subdirectory of IMPALA_TOOLCHAIN
(toolchain-packages-gcc${IMPALA_GCC_VERSION}) to distinguish
it from the old packages.
Some Python packages in the impala-python virtualenv are
compiled using the toolchain GCC and now use the new ABI.
This leads to two changes:
1. When constructing the LD_LIBRARY_PATH for impala-python,
we include the GCC libstdc++ libraries. Otherwise, certain
Python packages that use C++ fail on older OSes like CentOS 7.
This fixes IMPALA-9804.
2. Since developers work on various branches, this changes
the virtualenv's directory location to a directory with
the GCC version in the name. This allows the virtualenv
built with GCC 7 to coexist with the current virtualenv
built with GCC 4.9.2. The location for the old virtualenv is
${IMPALA_HOME}/infra/python/env. The new location is
${IMPALA_HOME}/infra/python/env-gcc${IMPALA_GCC_VERSION}. This
required updating several impala-python scripts.
There are various odds-and-ends related to the transition:
1. Due to the small string optimization, the size of std::string
changed, which means that various data structures also changed
in size. This required updating some static asserts.
2. There is a bug in clang-tidy that reports a use-after-free
for some code using std::shared_ptr. Clang is not modeling
the shared_ptr correctly, so it is a false-positive. As a workaround,
this disables the clang-analyzer-cplusplus.NewDelete diagnostic.
3. Various small compilation fixes (includes, etc).
Performance testing:
- Ran single-node performance tests on TPC-H for the following
configurations:
- TPC-H Parquet scale 30 with normal configurations
- TPC-H Parquet scale 30 with codegen disabled
- TPC-H Kudu scale 10
None found any significant regressions. Full results are
posted on the JIRA.
- Ran single-node performance tests on targeted-perf scale 10.
No significant regressions.
- The size of binaries (impalad, etc) is slightly smaller with the new GCC:
GCC 4.9.2 release impalad binary: 545664
GCC 7.5.0 release impalad binary: 539900
- Compilation in DEBUG mode is roughly 15-25% faster
Functional testing:
- Ran core jobs, exhaustive release jobs, UBSAN
Change-Id: Ia0beb2b618ba669c9699f8dbc0c52d1203d004e4
Reviewed-on: http://gerrit.cloudera.org:8080/16045
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
414 lines
17 KiB
Python
414 lines
17 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# This module will create a python virtual env and install external dependencies. If the
|
|
# virtualenv already exists and it contains all the expected packages, nothing is done.
|
|
#
|
|
# A multi-step bootstrapping process is required to build and install all of the
|
|
# dependencies:
|
|
# 1. install basic non-C/C++ packages into the virtualenv
|
|
# 1b. install packages that depend on step 1 but cannot be installed together with their
|
|
# dependencies
|
|
# 2. use the virtualenv Python to bootstrap the toolchain
|
|
# 3. use toolchain gcc to build C/C++ packages
|
|
# 4. build the kudu-python package with toolchain gcc and Cython
|
|
#
|
|
# Every time this script is run, it completes as many of the bootstrapping steps as
|
|
# possible with the available dependencies.
|
|
#
|
|
# This module can be run with python >= 2.4 but python >= 2.6 must be installed on the
|
|
# system. If the default 'python' command refers to < 2.6, python 2.6 will be used
|
|
# instead.
|
|
|
|
from __future__ import print_function
|
|
import glob
|
|
import logging
|
|
import optparse
|
|
import os
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tarfile
|
|
import tempfile
|
|
import textwrap
|
|
import urllib
|
|
from bootstrap_toolchain import ToolchainPackage
|
|
|
|
# Directory containing this script; every path below is derived from it.
_SCRIPT_DIR = os.path.dirname(__file__)

# Logger named after this script's file name, without the extension.
LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])

# Name of the environment variable that, when set to 'true', skips downloading the
# toolchain Python.
SKIP_TOOLCHAIN_BOOTSTRAP = "SKIP_TOOLCHAIN_BOOTSTRAP"

# Version of the toolchain GCC. It must be present in the environment because it is
# part of the virtualenv directory name.
GCC_VERSION = os.environ["IMPALA_GCC_VERSION"]

# Directory with the bundled packages and the requirements files.
DEPS_DIR = os.path.join(_SCRIPT_DIR, "deps")

# Location of the virtualenv. The GCC version is part of the name so that virtualenvs
# built with different compilers can coexist.
ENV_DIR = os.path.join(_SCRIPT_DIR, "env-gcc{0}".format(GCC_VERSION))

# Packages needed for our build and tests.
REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")

# Second stage of requirements: packages that cannot be installed together with their
# dependencies from requirements.txt.
REQS2_PATH = os.path.join(DEPS_DIR, "stage2-requirements.txt")

# Requirements for the bootstrapping step that builds compiled packages with the
# toolchain gcc.
COMPILED_REQS_PATH = os.path.join(DEPS_DIR, "compiled-requirements.txt")

# Requirements for the Kudu bootstrapping step, which depends on Cython being installed
# by the compiled requirements step.
KUDU_REQS_PATH = os.path.join(DEPS_DIR, "kudu-requirements.txt")

# Requirements for the ADLS test client step, which depends on Cffi (C Foreign Function
# Interface) being installed by the compiled requirements step.
ADLS_REQS_PATH = os.path.join(DEPS_DIR, "adls-requirements.txt")
|
|
|
|
def delete_virtualenv_if_exist():
  '''Removes the whole virtualenv directory (ENV_DIR). Does nothing if the directory
  is not there.'''
  if not os.path.exists(ENV_DIR):
    return
  shutil.rmtree(ENV_DIR)
|
|
|
|
|
|
def create_virtualenv():
  '''Creates a new virtualenv at ENV_DIR using the toolchain Python.

  The virtualenv tarball bundled in DEPS_DIR is unpacked into a temporary directory and
  its virtualenv.py is run with the Python returned by download_toolchain_python().
  The tarball handle is closed and the temporary directory is removed even if one of
  the steps fails.

  Raises an exception if the tarball or virtualenv.py cannot be located, or if the
  virtualenv command returns a non-zero status.
  '''
  LOG.info("Creating python virtualenv")
  build_dir = tempfile.mkdtemp()
  try:
    archive = tarfile.open(find_file(DEPS_DIR, "virtualenv*.tar.gz"), "r:gz")
    # try/finally rather than 'with': tarfile objects are only context managers from
    # python 2.7 on.
    try:
      archive.extractall(build_dir)
    finally:
      archive.close()
    python_cmd = download_toolchain_python()
    exec_cmd([python_cmd, find_file(build_dir, "virtualenv*", "virtualenv.py"), "--quiet",
              "--python", python_cmd, ENV_DIR])
  finally:
    # Don't leave the unpacked sources behind when a step above raises.
    shutil.rmtree(build_dir)
|
|
|
|
|
|
def exec_cmd(args, **kwargs):
  '''Runs a command, waits for it to finish and returns what it wrote. stderr is
  merged into stdout, so the returned output contains both.

  'args' and 'kwargs' use the same format as subprocess.Popen().

  Raises an exception that carries the command and its output if the return status is
  not zero.
  '''
  child = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                           **kwargs)
  captured, _ = child.communicate()
  if child.returncode == 0:
    return captured
  raise Exception("Command returned non-zero status\nCommand: %s\nOutput: %s"
                  % (args, captured))
|
|
|
|
def use_ccache():
  '''Returns true if ccache is available and should be used.

  ccache is never used when DISABLE_CCACHE is present in the environment (whatever its
  value). Otherwise it is considered available if "ccache -V" can be run successfully.
  '''
  if 'DISABLE_CCACHE' in os.environ:
    return False
  try:
    exec_cmd(['ccache', '-V'])
  except Exception:
    # A missing or failing ccache simply means "don't use it". Only Exception is caught
    # so that KeyboardInterrupt and SystemExit still propagate.
    return False
  return True
|
|
|
|
def select_cc():
  '''Returns the C compiler command to use, as a string, or None if the compiler is not
  available. The command is prefixed with "ccache" when use_ccache() allows it.'''
  # The toolchain gcc is used for ABI compatibility with other toolchain packages, e.g.
  # Kudu/kudu-python.
  if not have_toolchain():
    return None
  gcc_path = os.path.join(toolchain_pkg_dir("gcc"), "bin/gcc")
  if not os.path.exists(gcc_path):
    return None
  if use_ccache():
    return "ccache %s" % gcc_path
  return gcc_path
|
|
|
|
def exec_pip_install(args, cc="no-cc-available", env=None):
  '''Executes "pip install" with the provided command line arguments, then installs
  the local package at $IMPALA_HOME/lib/python in editable mode.

  'args' is a list of additional pip arguments, e.g. ["-r", "requirements.txt"].
  'cc' is exported as the CC environment variable of the first pip command. The default
  is a bogus value, which disables compilation of C/C++ code.
  'env' optionally supplies the environment of the first pip command. It is copied, so
  the caller's dict is never modified. If it is omitted (or empty), the current
  process's environment is inherited.

  Raises an exception if either pip command returns a non-zero status.'''
  # Always work on a copy: CC and NPY_NUM_BUILD_JOBS must not leak into the dict that
  # the caller passed in.
  env = dict(env) if env else dict(os.environ)
  env["CC"] = cc

  # Parallelize the slow numpy build.
  # Use getconf instead of nproc because it is supported more widely, e.g. on older
  # linux distributions.
  env["NPY_NUM_BUILD_JOBS"] = exec_cmd(["getconf", "_NPROCESSORS_ONLN"]).strip()

  # Don't call the virtualenv pip directly, it uses a hashbang to call the python
  # virtualenv using an absolute path. If the path to the virtualenv is very long, the
  # hashbang won't work.
  impala_pip_base_cmd = [os.path.join(ENV_DIR, "bin", "python"),
                         os.path.join(ENV_DIR, "bin", "pip"), "install", "-v"]

  # Passes --no-binary for IMPALA-3767: without this, Cython (and
  # several other packages) fail download.
  #
  # --no-cache-dir is used to prevent caching of compiled artifacts, which may be built
  # with different compilers or settings.
  third_party_pkg_install_cmd = \
      impala_pip_base_cmd[:] + ["--no-binary", ":all:", "--no-cache-dir"]

  # When using a custom mirror, we also must use the index of that mirror.
  if "PYPI_MIRROR" in os.environ:
    third_party_pkg_install_cmd.extend(["--index-url",
                                        "%s/simple" % os.environ["PYPI_MIRROR"]])
  else:
    # Prevent fetching additional packages from the index. If we forget to add a package
    # to one of the requirements.txt files, this should trigger an error. However, we will
    # still access the index for version/dependency resolution, hence we need to change it
    # when using a private mirror.
    third_party_pkg_install_cmd.append("--no-index")

  deps_url = "file://%s" % urllib.pathname2url(os.path.abspath(DEPS_DIR))
  third_party_pkg_install_cmd.extend(["--find-links", deps_url])
  third_party_pkg_install_cmd.extend(args)
  exec_cmd(third_party_pkg_install_cmd, env=env)

  # Finally, we want to install the packages from our own internal python lib. This
  # command runs with the current process's environment, not with 'env'.
  local_package_install_cmd = impala_pip_base_cmd + \
      ['-e', os.path.join(os.getenv('IMPALA_HOME'), 'lib', 'python')]
  exec_cmd(local_package_install_cmd)
|
|
|
|
|
|
def find_file(*paths):
  '''Joins 'paths' into a single glob pattern and returns the one path matching it.
  Raises an exception if nothing matches or if more than one path matches.

  Ex: find_file('/etc', 'h*sts') --> /etc/hosts
  '''
  pattern = os.path.join(*paths)
  matches = glob.glob(pattern)
  if not matches:
    raise Exception("No file found at %s" % pattern)
  if len(matches) > 1:
    raise Exception("Found too many files at %s: %s" % (pattern, matches))
  return matches[0]
|
|
|
|
|
|
def download_toolchain_python():
  '''Fetches the Python implementation from the Impala toolchain through the
  ToolchainPackage machinery of bin/bootstrap_toolchain.py and returns the path of its
  python executable.

  The download is skipped if SKIP_TOOLCHAIN_BOOTSTRAP=true in the environment. In that
  case only the presence of the Python executable is checked in the toolchain location.

  Raises an exception if $IMPALA_TOOLCHAIN_PACKAGES_HOME is unset or empty, or if the
  python executable does not exist afterwards.
  '''
  if not os.environ.get("IMPALA_TOOLCHAIN_PACKAGES_HOME"):
    raise Exception("Impala environment not set up correctly, make sure "
                    "$IMPALA_TOOLCHAIN_PACKAGES_HOME is set.")

  python_pkg = ToolchainPackage("python")
  if os.environ.get(SKIP_TOOLCHAIN_BOOTSTRAP) != 'true':
    python_pkg.download()
  python_cmd = os.path.join(python_pkg.pkg_directory(), "bin/python")
  if os.path.exists(python_cmd):
    return python_cmd
  raise Exception("Unexpected error bootstrapping python from toolchain: {0} does not "
                  "exist".format(python_cmd))
|
|
|
|
|
|
def install_deps():
  '''Installs the two non-compiled requirement stages into the virtualenv, in order,
  and marks each stage as installed once its pip run has succeeded.'''
  stages = (
    ("Installing packages into the virtualenv", REQS_PATH),
    ("Installing stage 2 packages into the virtualenv", REQS2_PATH),
  )
  for message, reqs_path in stages:
    LOG.info(message)
    exec_pip_install(["-r", reqs_path])
    mark_reqs_installed(reqs_path)
|
|
|
|
def have_toolchain():
  '''Returns true if the Impala toolchain is available, i.e. if
  IMPALA_TOOLCHAIN_PACKAGES_HOME is present in the environment (even if empty).'''
  return os.environ.get("IMPALA_TOOLCHAIN_PACKAGES_HOME") is not None
|
|
|
|
def toolchain_pkg_dir(pkg_name):
  '''Returns the path to the toolchain package 'pkg_name':
  $IMPALA_TOOLCHAIN_PACKAGES_HOME/<pkg_name>-$IMPALA_<PKG_NAME>_VERSION.

  Raises KeyError if either environment variable is not set.'''
  version_var = "IMPALA_" + pkg_name.upper() + "_VERSION"
  pkg_version = os.environ[version_var]
  packages_home = os.environ["IMPALA_TOOLCHAIN_PACKAGES_HOME"]
  return os.path.join(packages_home, "-".join((pkg_name, pkg_version)))
|
|
|
|
def install_compiled_deps_if_possible():
  '''Install dependencies that require compilation with toolchain GCC, if the toolchain
  is available. Returns true if the deps are installed (either already present or
  installed by this call) and false if no compiler is available yet.'''
  if reqs_are_installed(COMPILED_REQS_PATH):
    LOG.debug("Skipping compiled deps: matching compiled-installed-requirements.txt found")
    return True
  cc = select_cc()
  if cc is None:
    LOG.debug("Skipping compiled deps: cc not available yet")
    return False

  env = dict(os.environ)

  # Compilation of pycrypto fails on CentOS 5 with newer GCC versions because of a
  # problem with inline declarations in older libc headers. Setting -fgnu89-inline is a
  # workaround.
  try:
    distro_version = ''.join(exec_cmd(["lsb_release", "-irs"]).lower().split())
  except Exception:
    # lsb_release is only needed to detect CentOS 5. If it is missing or fails, carry
    # on without the workaround instead of aborting the whole installation.
    LOG.debug("Could not determine the distro with lsb_release", exc_info=True)
    distro_version = ''
  # Logged rather than printed so that stdout is not polluted.
  LOG.debug("Detected distro: %s", distro_version)
  if distro_version.startswith("centos5."):
    env["CFLAGS"] = "-fgnu89-inline"

  LOG.info("Installing compiled requirements into the virtualenv")
  exec_pip_install(["-r", COMPILED_REQS_PATH], cc=cc, env=env)
  mark_reqs_installed(COMPILED_REQS_PATH)
  return True
|
|
|
|
def install_adls_deps():
  '''Installs the ADLS test client requirements, but only if TARGET_FILESYSTEM is
  "adls". Returns True if the requirements were already installed and None otherwise.

  Raises an exception if the toolchain compiler is not available.'''
  # The ADLS dependencies require that the OS is at least CentOS 6.7 or above,
  # which is why we break this into a separate step. If the target filesystem is
  # ADLS, the expectation is that the dev environment is running at least CentOS 6.7.
  if os.environ.get('TARGET_FILESYSTEM') == "adls":
    if reqs_are_installed(ADLS_REQS_PATH):
      LOG.debug("Skipping ADLS deps: matching adls-installed-requirements.txt found")
      return True
    cc = select_cc()
    # Explicit check instead of an assert, which would be stripped under "python -O".
    if cc is None:
      raise Exception("Cannot install ADLS packages: the toolchain compiler is not "
                      "available")
    LOG.info("Installing ADLS packages into the virtualenv")
    exec_pip_install(["-r", ADLS_REQS_PATH], cc=cc)
    mark_reqs_installed(ADLS_REQS_PATH)
|
|
|
|
def install_kudu_client_if_possible():
  '''Installs the Kudu python module if possible, which depends on the toolchain and
  the compiled requirements in compiled-requirements.txt. If the toolchain compiler
  isn't available, nothing will be done. Also nothing will be done if the Kudu client
  lib required by the module isn't available (as determined by KUDU_IS_SUPPORTED) or
  if $IMPALA_KUDU_HOME doesn't exist.'''
  if reqs_are_installed(KUDU_REQS_PATH):
    LOG.debug("Skipping Kudu: matching kudu-installed-requirements.txt found")
    return
  if os.environ["KUDU_IS_SUPPORTED"] != "true":
    LOG.debug("Skipping Kudu: Kudu is not supported")
    return
  kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
  if not os.path.exists(kudu_base_dir):
    LOG.debug("Skipping Kudu: %s doesn't exist" % kudu_base_dir)
    return
  cc = select_cc()
  if cc is None:
    # Honor the documented contract: without a compiler this step is skipped and will
    # be retried the next time the script runs.
    LOG.debug("Skipping Kudu: cc not available yet")
    return

  LOG.info("Installing Kudu into the virtualenv")
  # The installation requires that KUDU_HOME/build/latest exists. An empty directory
  # structure will be made to satisfy that. The Kudu client headers and lib will be made
  # available through GCC environment variables.
  # A unique directory is used so that concurrent runs can't remove each other's
  # build directory.
  fake_kudu_build_dir = tempfile.mkdtemp(prefix="virtualenv-kudu")
  try:
    os.makedirs(os.path.join(fake_kudu_build_dir, "build", "latest"))
    env = dict(os.environ)
    env["KUDU_HOME"] = fake_kudu_build_dir
    kudu_client_dir = find_kudu_client_install_dir()
    env["CPLUS_INCLUDE_PATH"] = os.path.join(kudu_client_dir, "include")
    env["LIBRARY_PATH"] = os.path.pathsep.join([os.path.join(kudu_client_dir, 'lib'),
                                                os.path.join(kudu_client_dir, 'lib64')])
    exec_pip_install(["-r", KUDU_REQS_PATH], cc=cc, env=env)
    mark_reqs_installed(KUDU_REQS_PATH)
  finally:
    # Cleanup is best effort; a failure here must not hide the outcome of the install.
    try:
      shutil.rmtree(fake_kudu_build_dir)
    except Exception:
      LOG.debug("Error removing temp Kudu build dir", exc_info=True)
|
|
|
|
|
|
def find_kudu_client_install_dir():
  '''Returns the directory that contains the Kudu client's "include" and "lib"/"lib64"
  directories.

  If KUDU_CLIENT_DIR is set to a non-empty value, the client is expected under
  $KUDU_CLIENT_DIR/usr/local. Otherwise $IMPALA_KUDU_HOME/debug is used.

  Raises an exception if the client is required to exist at the chosen location but
  its header or library is not found there, and KeyError if IMPALA_KUDU_HOME is needed
  but not set.'''
  # An unset KUDU_CLIENT_DIR means the same as an empty one: no custom client.
  custom_client_dir = os.environ.get("KUDU_CLIENT_DIR")
  if custom_client_dir:
    install_dir = os.path.join(custom_client_dir, "usr", "local")
    error_if_kudu_client_not_found(install_dir)
  else:
    # If the toolchain appears to have been setup already, then the Kudu client is
    # required to exist. It's possible that the toolchain won't be setup yet though
    # since the toolchain bootstrap script depends on the virtualenv.
    kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
    install_dir = os.path.join(kudu_base_dir, "debug")
    if os.path.exists(kudu_base_dir):
      error_if_kudu_client_not_found(install_dir)
  return install_dir
|
|
|
|
|
|
def error_if_kudu_client_not_found(install_dir):
  '''Raises an exception unless 'install_dir' contains the Kudu client header
  (include/kudu/client/client.h) and libkudu_client.so. The library is searched for
  recursively in 'lib64' if that directory exists and in 'lib' otherwise.'''
  header_path = os.path.join(install_dir, "include", "kudu", "client", "client.h")
  if not os.path.exists(header_path):
    raise Exception("Kudu client header not found at %s" % header_path)

  kudu_client_lib = "libkudu_client.so"
  lib_dir = os.path.join(install_dir, "lib64")
  if not os.path.exists(lib_dir):
    lib_dir = os.path.join(install_dir, "lib")
  lib_found = any(kudu_client_lib in names for _, _, names in os.walk(lib_dir))
  if not lib_found:
    raise Exception("%s not found at %s" % (kudu_client_lib, lib_dir))
|
|
|
|
def mark_reqs_installed(reqs_path):
  '''Records that the requirements from 'reqs_path' are installed: the file is copied,
  under the same file name, into the root directory of the virtualenv.'''
  marker_name = os.path.basename(reqs_path)
  shutil.copyfile(reqs_path, os.path.join(ENV_DIR, marker_name))
|
|
|
|
def reqs_are_installed(reqs_path):
  '''Returns true if the requirements from 'reqs_path' are installed in the virtualenv.
  That is the case when the root directory of the virtualenv holds a file with the same
  name and exactly the same content.'''
  marker_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
  if not os.path.exists(marker_path):
    return False
  # Nested 'with' statements keep this compatible with python 2.6.
  with open(marker_path) as marker_file:
    with open(reqs_path) as wanted_file:
      up_to_date = wanted_file.read() == marker_file.read()
  if not up_to_date:
    LOG.debug("Virtualenv upgrade needed")
  return up_to_date
|
|
|
|
def setup_virtualenv_if_not_exists():
  '''Rebuilds the virtualenv from scratch unless both basic requirement stages are
  already marked as installed in it.'''
  if reqs_are_installed(REQS_PATH) and reqs_are_installed(REQS2_PATH):
    return
  delete_virtualenv_if_exist()
  create_virtualenv()
  install_deps()
  LOG.debug("Virtualenv setup complete")
|
|
|
|
|
|
# Script entry point: parse the options, then either print the library path or run
# the bootstrap steps.
if __name__ == "__main__":
  opt_parser = optparse.OptionParser()
  opt_parser.add_option("-l", "--log-level", default="INFO",
                        choices=("DEBUG", "INFO", "WARN", "ERROR"))
  opt_parser.add_option(
      "-r", "--rebuild", action="store_true", help="Force a rebuild of"
      " the virtualenv even if it exists and appears to be completely up-to-date.")
  opt_parser.add_option(
      "--print-ld-library-path", action="store_true", help="Print the"
      " LD_LIBRARY_PATH that should be used when running python from the virtualenv.")
  options, _ = opt_parser.parse_args()

  if options.print_ld_library_path:
    # Some python packages have native code that is compiled with the toolchain
    # compiler, so that code needs to dynamically link against matching library
    # versions.
    gcc_lib_dir = os.path.join(toolchain_pkg_dir("gcc"), 'lib64')
    kudu_client_dir = find_kudu_client_install_dir()
    ld_library_dirs = [gcc_lib_dir,
                       os.path.join(kudu_client_dir, 'lib'),
                       os.path.join(kudu_client_dir, 'lib64')]
    print(os.path.pathsep.join(ld_library_dirs))
    # Printing the path is all that was asked for; skip the bootstrap steps.
    sys.exit()

  logging.basicConfig(level=getattr(logging, options.log_level))
  if options.rebuild:
    delete_virtualenv_if_exist()

  # Complete as many bootstrap steps as possible (see file comment for the steps).
  setup_virtualenv_if_not_exists()
  # The Kudu client needs the compiled requirements, so it is only attempted once
  # those are in place.
  if install_compiled_deps_if_possible():
    install_kudu_client_if_possible()
    install_adls_deps()
|