mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
As part of moving to a newer protobuf, this updates the Kudu version
to get the fix for KUDU-3334. With this newer Kudu version, Clang
builds hit an error while linking:
lib/libLLVMCodeGen.a(TargetPassConfig.cpp.o):TargetPassConfig.cpp:
function llvm::TargetPassConfig::createRegAllocPass(bool):
error: relocation refers to global symbol "std::call_once<void (&)()>(std::once_flag&, void (&)())::{lambda()#2}::_FUN()",
which is defined in a discarded section
section group signature: "_ZZSt9call_onceIRFvvEJEEvRSt9once_flagOT_DpOT0_ENKUlvE0_clEv"
prevailing definition is from ../../build/debug/security/libsecurity.a(openssl_util.cc.o)
(This is from a newer binutils that will be pursued separately.)
As a hack to get around this error, this adds the calloncehack
shared library. The shared library publicly defines the symbol that
was coming from kudu_client. By linking it ahead of kudu_client, the
linker uses that rather than the one from kudu_client. This fixes
the Clang builds.
The new Kudu also requires a minor change to the flags for tserver
startup.
Testing:
- Ran debug tests and verified calloncehack is not used
- Ran ASAN tests
Change-Id: Ieccbe284f11445e1de792352ebc7c9e1fa2ca0c3
Reviewed-on: http://gerrit.cloudera.org:8080/18129
Reviewed-by: Wenzhe Zhou <wzhou@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
568 lines
26 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
# The purpose of this script is to download prebuilt binaries and jar files to satisfy
|
|
# the third-party dependencies for Impala. The script expects bin/impala-config.sh to be
|
|
# sourced to initialize various environment variables (including environment variables
|
|
# specifying the versions for components). It verifies that bin/impala-config.sh
|
|
# has been sourced by verifying that IMPALA_HOME is set. This script will fail if an
|
|
# expected environment variable is not present.
|
|
#
|
|
# To share a toolchain directory between multiple checkouts of Impala (or to use a
|
|
# cached copy to avoid downloading a new one), it is best to override IMPALA_TOOLCHAIN
|
|
# in bin/impala-config-local.sh or in the environment prior to sourcing
|
|
# bin/impala-config.sh. This sets IMPALA_TOOLCHAIN_PACKAGES_HOME as well as
|
|
# CDP_COMPONENTS_HOME.
|
|
#
|
|
# The following environment variables control the behavior of this script:
|
|
# IMPALA_TOOLCHAIN_PACKAGES_HOME - Directory in which to place the native-toolchain
|
|
# packages.
|
|
# IMPALA_TOOLCHAIN_HOST - The host to use for downloading the artifacts
|
|
# CDP_COMPONENTS_HOME - Directory to store CDP Hadoop component artifacts
|
|
# CDP_BUILD_NUMBER - CDP Hadoop components are built with consistent versions so that
|
|
# Hadoop, Hive, Kudu, etc are all built with versions that are compatible with each
|
|
# other. The way to specify a single consistent set of components is via a build
|
|
# number. This determines the location in s3 to get the artifacts.
|
|
# DOWNLOAD_CDH_COMPONENTS - When set to true, this script will also download and extract
|
|
# the CDP Hadoop components (i.e. Hadoop, Hive, HBase, Ranger, etc) into
|
|
# CDP_COMPONENTS_HOME as appropriate.
|
|
# IMPALA_<PACKAGE>_VERSION - The version expected for <PACKAGE>. This is typically
|
|
# configured in bin/impala-config.sh and must exist for every package. This is used
|
|
# to construct an appropriate URL and expected archive name.
|
|
# IMPALA_<PACKAGE>_URL - This overrides the download URL for <PACKAGE>. The URL must
|
|
# end with the expected archive name for this package. This is usually used in
|
|
# bin/impala-config-branch.sh or bin/impala-config-local.sh when using custom
|
|
# versions or packages. When this is not specified, packages are downloaded from
|
|
# an S3 bucket named native-toolchain, and the exact URL is based on
|
|
# IMPALA_<PACKAGE>_VERSION as well as the OS version being built on.
|
|
#
|
|
# The script is directly executable, and it takes no parameters:
|
|
# ./bootstrap_toolchain.py
|
|
import logging
|
|
import glob
|
|
import multiprocessing.pool
|
|
import os
|
|
import platform
|
|
import random
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
|
|
from collections import namedtuple
|
|
from string import Template
|
|
|
|
# Maps return values from 'lsb_release -irs' to the corresponding OS labels for both the
|
|
# toolchain and the CDP components.
|
|
# For Ubuntu20.04, the toolchain and CDP components to be mapped to are still 18.04
|
|
# based, due to the unavailability of 20.04 parts on EC2.
|
|
# Each OsMapping ties the normalized output of 'lsb_release -irs' (see
# get_platform_release_label()) to the platform labels used in download URLs:
#   lsb_release - substring pattern matched against the detected OS string
#   toolchain   - native-toolchain package label for this OS
#   cdh         - CDP component label, or None when no CDP builds exist for it
OsMapping = namedtuple('OsMapping', ['lsb_release', 'toolchain', 'cdh'])
# NOTE: entries are matched with re.search() in declaration order by
# get_platform_release_label(), so the first entry whose 'lsb_release' pattern
# occurs anywhere in the release string wins. Keep more specific names ahead of
# shorter substrings (e.g. "suselinux12" before "suse12").
OS_MAPPING = [
  OsMapping("centos5", "ec2-package-centos-5", None),
  OsMapping("centos6", "ec2-package-centos-6", "redhat6"),
  OsMapping("centos7", "ec2-package-centos-7", "redhat7"),
  OsMapping("centos8", "ec2-package-centos-8", "redhat8"),
  OsMapping("redhatenterpriseserver5", "ec2-package-centos-5", None),
  OsMapping("redhatenterpriseserver6", "ec2-package-centos-6", "redhat6"),
  OsMapping("redhatenterpriseserver7", "ec2-package-centos-7", "redhat7"),
  OsMapping("redhatenterprise8", "ec2-package-centos-8", "redhat8"),
  OsMapping("redhatenterpriseserver8", "ec2-package-centos-8", "redhat8"),
  OsMapping("debian6", "ec2-package-debian-6", None),
  OsMapping("debian7", "ec2-package-debian-7", None),
  OsMapping("debian8", "ec2-package-debian-8", "debian8"),
  OsMapping("suselinux11", "ec2-package-sles-11", None),
  OsMapping("suselinux12", "ec2-package-sles-12", "sles12"),
  OsMapping("suse12", "ec2-package-sles-12", "sles12"),
  OsMapping("ubuntu12.04", "ec2-package-ubuntu-12-04", None),
  OsMapping("ubuntu14.04", "ec2-package-ubuntu-14-04", None),
  OsMapping("ubuntu15.04", "ec2-package-ubuntu-14-04", None),
  OsMapping("ubuntu15.10", "ec2-package-ubuntu-14-04", None),
  # Ubuntu 20.04 intentionally maps to the 18.04 artifacts: 20.04 builds are
  # not yet available (see the comment above OsMapping).
  OsMapping('ubuntu16.04', "ec2-package-ubuntu-16-04", "ubuntu1604"),
  OsMapping('ubuntu18.04', "ec2-package-ubuntu-18-04", "ubuntu1804"),
  OsMapping('ubuntu20.04', "ec2-package-ubuntu-18-04", "ubuntu1804")
]
|
|
|
|
|
|
def check_output(cmd_args):
  """Run the command and return the output. Raise an exception if the command returns
  a non-zero return code. Similar to subprocess.check_output() which is only provided
  in python 2.7.
  """
  # NOTE(review): an identical check_output() is re-defined further down in this
  # file; at import time the later definition shadows this one, so this copy is
  # effectively dead code. One of the two should be removed.
  # stderr is merged into stdout so failures carry the full command output.
  process = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  stdout, _ = process.communicate()
  # wait() is a no-op after communicate(), but makes the exit-code check explicit.
  if process.wait() != 0:
    raise Exception("Command with args '%s' failed with exit code %s:\n%s"
                    % (cmd_args, process.returncode, stdout))
  return stdout
|
|
|
|
|
|
def get_toolchain_compiler():
  """Return the <name>-<version> string for the compiler package to use for the
  toolchain."""
  # GCC is currently the only compiler the toolchain builds with.
  gcc_version = os.environ["IMPALA_GCC_VERSION"]
  return "gcc-" + gcc_version
|
|
|
|
|
|
def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber):
  """Download 'download_path' (which must end in '/<file_name>') into the
  'destination' directory via wget, unpack the tarball there, and remove the
  downloaded archive. The download is retried a few times before giving up.

  Raises an Exception if the URL does not end with 'file_name', or if the
  download still fails on the final attempt.
  """
  if not download_path.endswith("/" + file_name):
    raise Exception("URL {0} does not match with expected file_name {1}"
                    .format(download_path, file_name))
  # Apache mirror links served through closer.cgi need an extra query parameter
  # to return the artifact itself rather than an HTML page.
  if "closer.cgi" in download_path:
    download_path += "?action=download"
  NUM_ATTEMPTS = 3
  for attempt in range(1, NUM_ATTEMPTS + 1):
    logging.info("Downloading {0} to {1}/{2} (attempt {3})".format(
        download_path, destination, file_name, attempt))
    # --no-clobber avoids downloading the file if a file with the name already exists
    try:
      cmd = ["wget", download_path,
             "--output-document={0}/{1}".format(destination, file_name)]
      if wget_no_clobber:
        cmd.append("--no-clobber")
      check_output(cmd)
      break
    except Exception as e:
      # BUGFIX: was 'except Exception, e:' (Python 2-only syntax), which is a
      # SyntaxError under Python 3. The 'as' form works on Python 2.6+ and 3.
      if attempt == NUM_ATTEMPTS:
        raise
      logging.error("Download failed; retrying after sleep: " + str(e))
      time.sleep(10 + random.random() * 5)  # Sleep between 10 and 15 seconds.
  logging.info("Extracting {0}".format(file_name))
  check_output(["tar", "xzf", os.path.join(destination, file_name),
                "--directory={0}".format(destination)])
  os.unlink(os.path.join(destination, file_name))
|
|
|
|
|
|
class DownloadUnpackTarball(object):
  """
  The basic unit of work for bootstrapping the toolchain:
  - check whether a package is already present (needs_download())
  - if not, fetch a tarball and unpack it into the right place (download())
  In this base class everything is known up front: the download url, the
  archive to unpack, and the destination directory.
  """

  def __init__(self, url, archive_name, destination_basedir, directory_name, makedir):
    self.url = url
    self.archive_name = archive_name
    assert self.archive_name.endswith(".tar.gz")
    self.archive_basename = self.archive_name.replace(".tar.gz", "")
    self.destination_basedir = destination_basedir
    # The destination base directory must already exist.
    assert os.path.isdir(self.destination_basedir)
    self.directory_name = directory_name
    self.makedir = makedir

  def pkg_directory(self):
    """Full path of the directory this package unpacks into."""
    return os.path.join(self.destination_basedir, self.directory_name)

  def needs_download(self):
    """True unless the package directory already exists."""
    return not os.path.isdir(self.pkg_directory())

  def download(self):
    """Fetch and unpack the archive, cleaning up any partial result on failure."""
    unpack_dir = self.pkg_directory()
    if self.makedir:
      # Download and unpack in a temp directory, which we'll later move into place
      work_dir = tempfile.mkdtemp(dir=self.destination_basedir)
    else:
      work_dir = self.destination_basedir
    try:
      wget_and_unpack_package(self.url, self.archive_name, work_dir, False)
    except:  # noqa
      # Clean up any partially-unpacked result.
      if os.path.isdir(unpack_dir):
        shutil.rmtree(unpack_dir)
      # Only delete the download directory if it is a temporary directory
      if work_dir != self.destination_basedir and os.path.isdir(work_dir):
        shutil.rmtree(work_dir)
      raise
    if self.makedir:
      os.rename(work_dir, unpack_dir)
|
|
|
|
|
|
class TemplatedDownloadUnpackTarball(DownloadUnpackTarball):
  """DownloadUnpackTarball whose url, archive name, destination and directory
  are given as string.Template patterns, expanded with 'template_subs' at
  construction time."""

  def __init__(self, url_tmpl, archive_name_tmpl, destination_basedir_tmpl,
               directory_name_tmpl, makedir, template_subs):
    # Expand every template with the same substitution dict.
    def expand(tmpl):
      return Template(tmpl).substitute(**template_subs)
    super(TemplatedDownloadUnpackTarball, self).__init__(
        expand(url_tmpl), expand(archive_name_tmpl),
        expand(destination_basedir_tmpl), expand(directory_name_tmpl), makedir)
|
|
|
|
|
|
class EnvVersionedPackage(TemplatedDownloadUnpackTarball):
  """Package whose version (and, optionally, download URL) are taken from the
  IMPALA_<NAME>_VERSION / IMPALA_<NAME>_URL environment variables unless
  overridden via 'explicit_version'."""

  def __init__(self, name, url_prefix_tmpl, destination_basedir, explicit_version=None,
               archive_basename_tmpl=None, unpack_directory_tmpl=None, makedir=False,
               template_subs_in={}):
    # BUGFIX: copy instead of aliasing. The original mutated 'template_subs_in'
    # in place, which modified both the shared mutable default dict and any
    # dict passed in by a caller (e.g. ToolchainPackage's template_subs).
    template_subs = dict(template_subs_in)
    template_subs["name"] = name
    template_subs["version"] = self.__compute_version(name, explicit_version)
    # The common case is that X.tar.gz unpacks to X directory. archive_basename_tmpl
    # allows overriding the value of X (which defaults to ${name}-${version}).
    # If X.tar.gz unpacks to Y directory, then unpack_directory_tmpl allows overriding Y.
    if archive_basename_tmpl is None:
      archive_basename_tmpl = "${name}-${version}"
    archive_name_tmpl = archive_basename_tmpl + ".tar.gz"
    if unpack_directory_tmpl is None:
      unpack_directory_tmpl = archive_basename_tmpl
    url_tmpl = self.__compute_url(name, archive_name_tmpl, url_prefix_tmpl)
    super(EnvVersionedPackage, self).__init__(url_tmpl, archive_name_tmpl,
        destination_basedir, unpack_directory_tmpl, makedir, template_subs)

  def __compute_version(self, name, explicit_version):
    """Return explicit_version if given, else IMPALA_<NAME>_VERSION from the
    environment. Raises if neither is available."""
    if explicit_version is not None:
      return explicit_version
    # When getting the version from the environment, we need to standardize the name
    # to match expected environment variables.
    std_env_name = name.replace("-", "_").upper()
    version_env_var = "IMPALA_{0}_VERSION".format(std_env_name)
    env_version = os.environ.get(version_env_var)
    if not env_version:
      raise Exception("Could not find version for {0} in environment var {1}".format(
          name, version_env_var))
    return env_version

  def __compute_url(self, name, archive_name_tmpl, url_prefix_tmpl):
    """Return IMPALA_<NAME>_URL if set; otherwise url_prefix + archive_name."""
    std_env_name = name.replace("-", "_").upper()
    url_env_var = "IMPALA_{0}_URL".format(std_env_name)
    url_tmpl = os.environ.get(url_env_var)
    if not url_tmpl:
      url_tmpl = os.path.join(url_prefix_tmpl, archive_name_tmpl)
    return url_tmpl
|
|
|
|
|
|
class ToolchainPackage(EnvVersionedPackage):
  """A native-toolchain package downloaded from IMPALA_TOOLCHAIN_HOST. Archives
  are laid out by toolchain build id, compiler and platform label, and each
  unpacked package carries a version marker file."""

  def __init__(self, name, explicit_version=None, platform_release=None):
    packages_home = os.environ.get("IMPALA_TOOLCHAIN_PACKAGES_HOME")
    if not packages_home:
      logging.error("Impala environment not set up correctly, make sure "
                    "$IMPALA_TOOLCHAIN_PACKAGES_HOME is set.")
      sys.exit(1)
    subs = {
        'compiler': get_toolchain_compiler(),
        'label': get_platform_release_label(release=platform_release).toolchain,
        'toolchain_build_id': os.environ["IMPALA_TOOLCHAIN_BUILD_ID"],
        'toolchain_host': os.environ["IMPALA_TOOLCHAIN_HOST"],
    }
    super(ToolchainPackage, self).__init__(
        name,
        "https://${toolchain_host}/build/${toolchain_build_id}/"
        "${name}/${version}-${compiler}/",
        packages_home,
        explicit_version=explicit_version,
        archive_basename_tmpl="${name}-${version}-${compiler}-${label}",
        unpack_directory_tmpl="${name}-${version}",
        template_subs_in=subs)

  def needs_download(self):
    """Download unless the package directory exists AND its version marker file
    matches the expected archive basename."""
    unpack_dir = self.pkg_directory()
    if not os.path.isdir(unpack_dir):
      return True
    version_file = os.path.join(unpack_dir, "toolchain_package_version.txt")
    if not os.path.exists(version_file):
      return True
    with open(version_file, "r") as f:
      return f.read().strip() != self.archive_basename

  def download(self):
    """Replace any existing package directory, then write the version marker."""
    # Remove the existing directory first: needs_download() can be true even
    # when the directory exists (e.g. a stale version marker).
    unpack_dir = self.pkg_directory()
    if os.path.exists(unpack_dir):
      logging.info("Removing existing package directory {0}".format(unpack_dir))
      shutil.rmtree(unpack_dir)
    super(ToolchainPackage, self).download()
    # Record which archive produced this directory for future needs_download().
    marker = os.path.join(unpack_dir, "toolchain_package_version.txt")
    with open(marker, "w") as f:
      f.write(self.archive_basename)
|
|
|
|
|
|
class CdpComponent(EnvVersionedPackage):
  """A CDP Hadoop component tarball, downloaded from the toolchain host under a
  consistent CDP_BUILD_NUMBER and unpacked into CDP_COMPONENTS_HOME."""

  def __init__(self, name, explicit_version=None, archive_basename_tmpl=None,
               unpack_directory_tmpl=None, makedir=False):
    # Both variables are needed to construct the CDP base URL.
    if any(v not in os.environ
           for v in ("IMPALA_TOOLCHAIN_HOST", "CDP_BUILD_NUMBER")):
      logging.error("Impala environment not set up correctly, make sure "
                    "impala-config.sh is sourced.")
      sys.exit(1)
    subs = {"toolchain_host": os.environ["IMPALA_TOOLCHAIN_HOST"],
            "cdp_build_number": os.environ["CDP_BUILD_NUMBER"]}
    url_prefix_tmpl = ("https://${toolchain_host}/build/cdp_components/"
                       "${cdp_build_number}/tarballs/")
    # Output goes under CDP_COMPONENTS_HOME.
    super(CdpComponent, self).__init__(name, url_prefix_tmpl,
        os.environ["CDP_COMPONENTS_HOME"],
        explicit_version=explicit_version,
        archive_basename_tmpl=archive_basename_tmpl,
        unpack_directory_tmpl=unpack_directory_tmpl,
        makedir=makedir, template_subs_in=subs)
|
|
|
|
|
|
class ApacheComponent(EnvVersionedPackage):
  """An Apache component tarball downloaded from an Apache mirror
  (APACHE_MIRROR) and unpacked into APACHE_COMPONENTS_HOME."""

  def __init__(self, name, explicit_version=None, archive_basename_tmpl=None,
               unpack_directory_tmpl=None, makedir=False, component_path_tmpl=None):
    # Compute the apache base URL (based on the APACHE_MIRROR).
    # BUGFIX: also check APACHE_MIRROR here. The original guard only checked
    # APACHE_COMPONENTS_HOME but then read os.environ["APACHE_MIRROR"]
    # unconditionally, producing a raw KeyError instead of this clear error.
    if "APACHE_COMPONENTS_HOME" not in os.environ or "APACHE_MIRROR" not in os.environ:
      logging.error("Impala environment not set up correctly, make sure "
                    "impala-config.sh is sourced.")
      sys.exit(1)
    template_subs = {"apache_mirror": os.environ["APACHE_MIRROR"]}
    # Different components have different sub-paths. For example, hive is hive/hive-xxx,
    # hadoop is hadoop/common/hadoop-xxx. The default is hive format.
    if component_path_tmpl is None:
      component_path_tmpl = "${name}/${name}-${version}/"
    url_prefix_tmpl = "${apache_mirror}/" + component_path_tmpl

    # Get the output base directory from APACHE_COMPONENTS_HOME
    destination_basedir = os.environ["APACHE_COMPONENTS_HOME"]
    super(ApacheComponent, self).__init__(name, url_prefix_tmpl, destination_basedir,
                                          explicit_version=explicit_version,
                                          archive_basename_tmpl=archive_basename_tmpl,
                                          unpack_directory_tmpl=unpack_directory_tmpl,
                                          makedir=makedir, template_subs_in=template_subs)
|
|
|
|
class ToolchainKudu(ToolchainPackage):
  """The native 'kudu' toolchain package (which also bundles Java artifacts)."""

  def __init__(self, platform_label=None):
    super(ToolchainKudu, self).__init__('kudu', platform_release=platform_label)

  def needs_download(self):
    """In addition to the base checks (unpack directory and version marker),
    require a 'debug' subdirectory: regardless of the actual build type the
    'kudu' tarball always contains 'debug' and 'release' directories, which
    distinguishes it from the Kudu Java package."""
    if super(ToolchainKudu, self).needs_download():
      return True
    return not os.path.exists(os.path.join(self.pkg_directory(), "debug"))
|
|
|
|
|
|
def try_get_platform_release_label():
  """Gets the right package label from the OS version. Returns an OsMapping
  with both 'toolchain' and 'cdh' labels, or None when no mapping is found."""
  try:
    label = get_platform_release_label()
  except Exception:
    label = None
  return label
|
|
|
|
|
|
# Cache "lsb_release -irs" to avoid excessive logging from sh, and
# to shave a little bit of time.
lsb_release_cache = None


def get_platform_release_label(release=None):
  """Gets the right package label from the OS version. Raise exception if not found.
  'release' can be provided to override the underlying OS version.
  """
  global lsb_release_cache
  if not release:
    if lsb_release_cache:
      release = lsb_release_cache
    else:
      lsb_release = check_output(["lsb_release", "-irs"])
      # Normalize to e.g. "centos7.4" / "ubuntu16.04": lowercase, whitespace removed.
      # NOTE(review): under Python 3 check_output() returns bytes, so this join
      # would need a decode() first — the script currently targets Python 2.
      release = "".join(map(lambda x: x.lower(), lsb_release.split()))
      # Only need to check against the major release if RHEL, CentOS or Suse
      for distro in ['centos', 'redhatenterprise', 'redhatenterpriseserver', 'suse']:
        if distro in release:
          release = release.split('.')[0]
          break
      # Cache only the auto-detected value; explicit 'release' args bypass the cache.
      lsb_release_cache = release
  for mapping in OS_MAPPING:
    # Substring match: the first OS_MAPPING entry whose pattern occurs anywhere
    # in 'release' wins, so OS_MAPPING's declaration order matters.
    if re.search(mapping.lsb_release, release):
      return mapping
  raise Exception("Could not find package label for OS version: {0}.".format(release))
|
|
|
|
|
|
def check_output(cmd_args):
  """Run 'cmd_args' and return its output (stderr merged into stdout).
  Raise an exception carrying the output if the command exits non-zero.
  Similar to subprocess.check_output(), which is only provided in python 2.7.
  """
  proc = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  out, _ = proc.communicate()
  rc = proc.wait()
  if rc != 0:
    raise Exception("Command with args '%s' failed with exit code %s:\n%s"
                    % (cmd_args, rc, out))
  return out
|
|
|
|
|
|
def check_custom_toolchain(toolchain_packages_home, packages):
  """Verify that every package's directory already exists; log the missing ones
  and raise if any are absent. ('toolchain_packages_home' is unused but kept
  for interface compatibility.)"""
  missing = [(p, p.pkg_directory()) for p in packages
             if not os.path.isdir(p.pkg_directory())]
  if not missing:
    return
  msg = "The following packages are not in their expected locations.\n"
  for pkg, pkg_dir in missing:
    msg += " %s (expected directory %s to exist)\n" % (pkg, pkg_dir)
  msg += "Pre-built toolchain archives not available for your platform.\n"
  msg += "Clone and build native toolchain from source using this repository:\n"
  msg += " https://github.com/cloudera/native-toolchain\n"
  logging.error(msg)
  raise Exception("Toolchain bootstrap failed: required packages were missing")
|
|
|
|
|
|
def execute_many(f, args):
  """
  Executes f(a) for a in args using a threadpool to execute in parallel,
  returning the results in input order. The pool uses the smaller of 4 and
  the number of CPUs in the system as the pool size.
  """
  pool = multiprocessing.pool.ThreadPool(processes=min(multiprocessing.cpu_count(), 4))
  try:
    return pool.map(f, args, 1)
  finally:
    # BUGFIX: the original never closed the pool, leaking its worker threads
    # until interpreter shutdown. map() has already completed here, so
    # close()/join() return promptly.
    pool.close()
    pool.join()
|
|
|
|
|
|
def create_directory_from_env_var(env_var):
  """Create the directory named by environment variable 'env_var' if it does
  not already exist. Exits the process when the variable is unset or empty."""
  dir_name = os.environ.get(env_var)
  if dir_name:
    if not os.path.exists(dir_name):
      os.makedirs(dir_name)
    return
  logging.error("Impala environment not set up correctly, make sure "
                "{0} is set.".format(env_var))
  sys.exit(1)
|
|
|
|
|
|
def get_toolchain_downloads():
  """Return the list of toolchain packages to fetch, or [] when this platform
  is unsupported but a valid custom toolchain is already in place."""
  # The LLVM and GCC packages are the largest in the toolchain (Kudu is handled
  # separately), so list them first to start their downloads as soon as possible.
  packages = [
      ToolchainPackage("llvm"),
      ToolchainPackage("llvm",
                       explicit_version=os.environ.get("IMPALA_LLVM_DEBUG_VERSION")),
      ToolchainPackage("gcc"),
  ]
  packages += [ToolchainPackage(name) for name in
               ["avro", "binutils", "boost", "breakpad", "bzip2", "calloncehack",
                "cctz", "cmake", "crcutil", "curl", "flatbuffers", "gdb", "gflags",
                "glog", "gperftools", "gtest", "jwt-cpp", "libev", "libunwind",
                "lz4", "openldap", "openssl", "orc", "protobuf", "python",
                "rapidjson", "re2", "snappy", "thrift", "tpc-h", "tpc-ds", "zlib",
                "zstd"]]
  # Check whether this platform is supported (or whether a valid custom toolchain
  # has been provided).
  label = try_get_platform_release_label()
  if not label or not label.toolchain:
    packages_home = os.environ.get("IMPALA_TOOLCHAIN_PACKAGES_HOME")
    # This throws an exception if the custom toolchain is not valid; otherwise
    # there is nothing to download.
    check_custom_toolchain(packages_home, packages)
    return []
  return packages
|
|
|
|
|
|
def get_hadoop_downloads():
  """Return the list of Hadoop ecosystem components to download (CDP builds,
  plus Apache builds of Hive when USE_APACHE_HIVE is set)."""
  hadoop = CdpComponent("hadoop")
  hbase = CdpComponent("hbase", archive_basename_tmpl="hbase-${version}-bin",
                       unpack_directory_tmpl="hbase-${version}")
  if os.environ["USE_APACHE_HIVE"] == "true":
    hive = ApacheComponent("hive", archive_basename_tmpl="apache-hive-${version}-bin")
    hive_src = ApacheComponent("hive",
                               archive_basename_tmpl="apache-hive-${version}-src")
  else:
    hive = CdpComponent("hive", archive_basename_tmpl="apache-hive-${version}-bin")
    hive_src = CdpComponent("hive-source",
                            explicit_version=os.environ.get("IMPALA_HIVE_VERSION"),
                            archive_basename_tmpl="hive-${version}-source",
                            unpack_directory_tmpl="hive-${version}")
  tez = CdpComponent("tez", archive_basename_tmpl="tez-${version}-minimal",
                     makedir=True)
  # A locally built Hive (signalled by HIVE_VERSION_OVERRIDE) removes the need
  # to pull hive as a dependency.
  if os.environ.get("HIVE_VERSION_OVERRIDE", "") != "":
    components = [hadoop, hbase, tez]
  else:
    components = [hadoop, hbase, hive, hive_src, tez]
  # Ranger is always CDP
  components.append(CdpComponent("ranger",
                                 archive_basename_tmpl="ranger-${version}-admin"))
  return components
|
|
|
|
|
|
def get_kudu_downloads():
  """Return the Kudu downloads; the toolchain Kudu package also bundles the
  Java artifacts."""
  return [ToolchainKudu()]
|
|
|
|
|
|
def main():
  """
  Entry point. Verifies that bin/impala-config.sh has been sourced (IMPALA_HOME
  and IMPALA_TOOLCHAIN_PACKAGES_HOME must be set; the IMPALA_<PACKAGE>_VERSION
  variables are assumed to be set alongside them), creates
  $IMPALA_TOOLCHAIN_PACKAGES_HOME if needed, then computes which packages are
  missing and downloads only those, in parallel. Two categories exist:
  native toolchain packages (always fetched unless SKIP_TOOLCHAIN_BOOTSTRAP is
  "true") and Hadoop component packages (CDP builds of Hadoop, Hive, HBase,
  etc., versioned as a consistent set by CDP_BUILD_NUMBER and fetched into
  $CDP_COMPONENTS_HOME only when DOWNLOAD_CDH_COMPONENTS is "true").
  """
  logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s %(threadName)s %(levelname)s: %(message)s')
  # 'sh' module logs at every execution, which is too noisy
  logging.getLogger("sh").setLevel(logging.WARNING)

  if not os.environ.get("IMPALA_HOME"):
    logging.error("Impala environment not set up correctly, make sure "
                  "impala-config.sh is sourced.")
    sys.exit(1)

  # Create the toolchain directory if necessary
  create_directory_from_env_var("IMPALA_TOOLCHAIN_PACKAGES_HOME")

  downloads = []
  if os.getenv("SKIP_TOOLCHAIN_BOOTSTRAP", "false") != "true":
    downloads.extend(get_toolchain_downloads())
  if os.getenv("DOWNLOAD_CDH_COMPONENTS", "false") == "true":
    create_directory_from_env_var("CDP_COMPONENTS_HOME")
    create_directory_from_env_var("APACHE_COMPONENTS_HOME")
    # Kudu is skipped on aarch64 (presumably no prebuilt artifacts — see
    # get_kudu_downloads()).
    if platform.processor() != "aarch64":
      downloads.extend(get_kudu_downloads())
    downloads.extend(get_hadoop_downloads())

  pending = [d for d in downloads if d.needs_download()]

  def download_one(component):
    component.download()

  execute_many(download_one, pending)


if __name__ == "__main__":
  main()
|