# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# This module will create a python virtual env and install external dependencies. If the
# virtualenv already exists and it contains all the expected packages, nothing is done.
#
# It is expected that bootstrap_toolchain.py already ran prior to running this
# (and thus the toolchain GCC compiler is in place).
#
# The virtualenv creation process involves multiple rounds of pip installs, but
# this script expects to complete all rounds in a single invocation. The steps are:
# 1. Install setuptools and its dependencies. These are used by the setup.py scripts
#    that run during pip install.
# 2. Install most packages (including ones that require C/C++ compilation)
# 3. Install Kudu package (which uses the toolchain GCC and the installed Cython)
# 4. Install ADLS packages if applicable
#
# This module can be run with python >= 2.7. It makes no guarantees about usage on
# python < 2.7.

from __future__ import print_function
import glob
import logging
import optparse
import os
import shutil
import subprocess
import sys
import tarfile
import tempfile

try:
  from urllib.request import pathname2url
except ImportError:
  from urllib import pathname2url

from bootstrap_toolchain import ToolchainPackage

LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])

SKIP_TOOLCHAIN_BOOTSTRAP = "SKIP_TOOLCHAIN_BOOTSTRAP"

GCC_VERSION = os.environ["IMPALA_GCC_VERSION"]

DEPS_DIR = os.path.join(os.path.dirname(__file__), "deps")
ENV_DIR = os.path.join(os.path.dirname(__file__), "env-gcc{0}".format(GCC_VERSION))

# Setuptools requirements file. Setuptools is required during pip install for
# some packages. Newer setuptools dropped python 2 support, and some python
# install tools don't understand that they need to get a version that works
# with the current python version. This can cause them to try to install the newer
# setuptools that won't work on python 2. Doing this as a separate step makes it
# easy to pin the version of setuptools to a Python 2 compatible version.
SETUPTOOLS_REQS_PATH = os.path.join(DEPS_DIR, "setuptools-requirements.txt")

# Requirements file with packages we need for our build and tests, which depends
# on setuptools being installed by the setuptools requirements step.
REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")

# Requirements for the Kudu bootstrapping step, which depends on Cython being installed
# by the requirements step.
KUDU_REQS_PATH = os.path.join(DEPS_DIR, "kudu-requirements.txt")

# Requirements for the ADLS test client step, which depends on Cffi (C Foreign Function
# Interface) being installed by the requirements step.
ADLS_REQS_PATH = os.path.join(DEPS_DIR, "adls-requirements.txt")


def delete_virtualenv_if_exist():
  if os.path.exists(ENV_DIR):
    shutil.rmtree(ENV_DIR)


def detect_virtualenv_version():
  with open(REQS_PATH, "r") as reqs_file:
    for line in reqs_file:
      line = line.strip()
      # Ignore blank lines and comments
      if len(line) == 0 or line[0] == '#':
        continue
      if line.find("virtualenv") != -1 and line.find("==") != -1:
        packagestring, version = [a.strip() for a in line.split("==")]
        if packagestring == "virtualenv":
          LOG.debug("Detected virtualenv version {0}".format(version))
          return version
  # If the parsing didn't work, don't raise an exception.
  return None


def create_virtualenv():
  LOG.info("Creating python virtualenv")
  build_dir = tempfile.mkdtemp()
  # Try to find the virtualenv version by parsing the requirements file.
  # Default to "*" if we can't figure it out.
  virtualenv_version = detect_virtualenv_version()
  if virtualenv_version is None:
    virtualenv_version = "*"
  # Open the virtualenv tarball
  virtualenv_tarball = \
      find_file(DEPS_DIR, "virtualenv-{0}.tar.gz".format(virtualenv_version))
  file = tarfile.open(virtualenv_tarball, "r:gz")
  for member in file.getmembers():
    file.extract(member, build_dir)
  file.close()
  python_cmd = download_toolchain_python()
  exec_cmd([python_cmd, find_file(build_dir, "virtualenv*", "virtualenv.py"), "--quiet",
      "--python", python_cmd, ENV_DIR])
  shutil.rmtree(build_dir)


def exec_cmd(args, **kwargs):
  '''Executes a command and waits for it to finish, raises an exception if the return
  status is not zero. The command output is returned.

  'args' and 'kwargs' use the same format as subprocess.Popen().
  '''
  process = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
      universal_newlines=True, **kwargs)
  output = process.communicate()[0]
  if process.returncode != 0:
    raise Exception("Command returned non-zero status\nCommand: %s\nOutput: %s"
        % (args, output))
  return output


def select_cc():
  '''Return the C compiler command that should be used as a string, or None if the
  compiler is not available.
  '''
  # Use toolchain gcc for ABI compatibility with other toolchain packages, e.g.
  # Kudu/kudu-python.
  if not have_toolchain():
    return None
  toolchain_gcc_dir = toolchain_pkg_dir("gcc")
  cc = os.path.join(toolchain_gcc_dir, "bin/gcc")
  if not os.path.exists(cc):
    return None
  return cc


def exec_pip_install(args, cc="no-cc-available", env=None):
  '''Executes "pip install" with the provided command line arguments. If 'cc' is set, it
  is used as the C compiler. Otherwise compilation of C/C++ code is disabled by setting
  the CC environment variable to a bogus value. Other environment vars can optionally be
  set with the 'env' argument. By default the current process's environment variables
  are inherited.'''
  if not env:
    env = dict(os.environ)
  env["CC"] = cc

  # Since gcc is now built with toolchain binutils which may be newer than the
  # system binutils, we need to include the toolchain binutils on the PATH.
  toolchain_binutils_dir = toolchain_pkg_dir("binutils")
  binutils_bin_dir = os.path.join(toolchain_binutils_dir, "bin")
  env["PATH"] = "{0}:{1}".format(binutils_bin_dir, env["PATH"])

  # Parallelize the slow numpy build.
  # Use getconf instead of nproc because it is supported more widely, e.g. on older
  # linux distributions.
  env["NPY_NUM_BUILD_JOBS"] = exec_cmd(["getconf", "_NPROCESSORS_ONLN"]).strip()

  # Don't call the virtualenv pip directly, it uses a hashbang to call the virtualenv
  # python using an absolute path. If the path to the virtualenv is very long, the
  # hashbang won't work.
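  # For illustration only: the base command built below resolves to roughly
  #   <ENV_DIR>/bin/python <ENV_DIR>/bin/pip install -v ...
  # where ENV_DIR is the env-gcc<IMPALA_GCC_VERSION> directory defined at the top.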
  impala_pip_base_cmd = [os.path.join(ENV_DIR, "bin", "python"),
      os.path.join(ENV_DIR, "bin", "pip"), "install", "-v"]

  # Passes --no-binary for IMPALA-3767: without this, Cython (and
  # several other packages) fail to download.
  #
  # --no-cache-dir is used to prevent caching of compiled artifacts, which may be built
  # with different compilers or settings.
  third_party_pkg_install_cmd = \
      impala_pip_base_cmd[:] + ["--no-binary", ":all:", "--no-cache-dir"]

  # When using a custom mirror, we also must use the index of that mirror.
  if "PYPI_MIRROR" in os.environ:
    third_party_pkg_install_cmd.extend(["--index-url",
        "%s/simple" % os.environ["PYPI_MIRROR"]])
  else:
    # Prevent fetching additional packages from the index. If we forget to add a package
    # to one of the requirements.txt files, this should trigger an error. However, we
    # will still access the index for version/dependency resolution, hence we need to
    # change it when using a private mirror.
    third_party_pkg_install_cmd.append("--no-index")

  third_party_pkg_install_cmd.extend(["--find-links",
      "file://%s" % pathname2url(os.path.abspath(DEPS_DIR))])
  third_party_pkg_install_cmd.extend(args)
  exec_cmd(third_party_pkg_install_cmd, env=env)

  # Finally, we want to install the packages from our own internal python lib.
  local_package_install_cmd = impala_pip_base_cmd + \
      ['-e', os.path.join(os.getenv('IMPALA_HOME'), 'lib', 'python')]
  exec_cmd(local_package_install_cmd)


def find_file(*paths):
  '''Returns the path specified by the glob 'paths', raises an exception if no file is
  found.

  Ex: find_file('/etc', 'h*sts') --> /etc/hosts
  '''
  path = os.path.join(*paths)
  files = glob.glob(path)
  if len(files) > 1:
    raise Exception("Found too many files at %s: %s" % (path, files))
  if len(files) == 0:
    raise Exception("No file found at %s" % path)
  return files[0]


def download_toolchain_python():
  '''Grabs the Python implementation from the Impala toolchain, using the machinery
  from bin/bootstrap_toolchain.py. Skip the download if SKIP_TOOLCHAIN_BOOTSTRAP=true
  in the environment. In that case only the presence of the Python executable is
  checked in the toolchain location.
  '''
  toolchain_packages_home = os.environ.get("IMPALA_TOOLCHAIN_PACKAGES_HOME")
  if not toolchain_packages_home:
    raise Exception("Impala environment not set up correctly, make sure "
        "$IMPALA_TOOLCHAIN_PACKAGES_HOME is set.")
  package = ToolchainPackage("python")
  if package.needs_download() and \
      not (os.environ.get(SKIP_TOOLCHAIN_BOOTSTRAP) == 'true'):
    package.download()
  python_cmd = os.path.join(package.pkg_directory(), "bin/python")
  if not os.path.exists(python_cmd):
    raise Exception("Unexpected error bootstrapping python from toolchain: {0} does not "
        "exist".format(python_cmd))
  return python_cmd


def install_deps():
  LOG.info("Installing setuptools into the virtualenv")
  exec_pip_install(["-r", SETUPTOOLS_REQS_PATH])
  cc = select_cc()
  if cc is None:
    raise Exception("CC not available")
  env = dict(os.environ)
  LOG.info("Installing packages into the virtualenv")
  exec_pip_install(["-r", REQS_PATH], cc=cc, env=env)
  mark_reqs_installed(REQS_PATH)


def have_toolchain():
  '''Return true if the Impala toolchain is available.'''
  return "IMPALA_TOOLCHAIN_PACKAGES_HOME" in os.environ


def toolchain_pkg_dir(pkg_name):
  '''Return the path to the toolchain package.'''
  pkg_version = os.environ["IMPALA_" + pkg_name.upper() + "_VERSION"]
  return os.path.join(os.environ["IMPALA_TOOLCHAIN_PACKAGES_HOME"],
      pkg_name + "-" + pkg_version)


def install_adls_deps():
  # The ADLS dependencies require that the OS is CentOS 6.7 or above, which is why we
  # break this into a separate step. If the target filesystem is ADLS, the expectation
  # is that the dev environment is running at least CentOS 6.7.
  if os.environ.get('TARGET_FILESYSTEM') == "adls":
    if reqs_are_installed(ADLS_REQS_PATH):
      LOG.debug("Skipping ADLS deps: matching adls-installed-requirements.txt found")
      return True
    cc = select_cc()
    assert cc is not None
    LOG.info("Installing ADLS packages into the virtualenv")
    exec_pip_install(["-r", ADLS_REQS_PATH], cc=cc)
    mark_reqs_installed(ADLS_REQS_PATH)


def install_kudu_client_if_possible():
  '''Installs the Kudu python module if possible, which depends on the toolchain and
  the compiled requirements in requirements.txt. If the toolchain isn't available,
  nothing will be done.'''
  if reqs_are_installed(KUDU_REQS_PATH):
    LOG.debug("Skipping Kudu: matching kudu-installed-requirements.txt found")
    return
  kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
  if not os.path.exists(kudu_base_dir):
    LOG.debug("Skipping Kudu: %s doesn't exist" % kudu_base_dir)
    return

  LOG.info("Installing Kudu into the virtualenv")
  # The installation requires that KUDU_HOME/build/latest exists. An empty directory
  # structure will be made to satisfy that. The Kudu client headers and lib will be made
  # available through GCC environment variables.
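  # For illustration only: the temporary layout built below looks roughly like
  #   <tempdir>/virtualenv-kudu/build/latest/src   (a copy of the Kudu client headers)
  # with KUDU_HOME pointed at <tempdir>/virtualenv-kudu just for this pip install.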
  fake_kudu_build_dir = os.path.join(tempfile.gettempdir(), "virtualenv-kudu")
  try:
    artifact_dir = os.path.join(fake_kudu_build_dir, "build", "latest")
    if not os.path.exists(artifact_dir):
      os.makedirs(artifact_dir)
    cc = select_cc()
    assert cc is not None
    env = dict(os.environ)
    env["KUDU_HOME"] = fake_kudu_build_dir
    kudu_client_dir = find_kudu_client_install_dir()
    # Copy the include directory to the fake build directory.
    kudu_include_dir = os.path.join(kudu_client_dir, "include")
    shutil.copytree(kudu_include_dir,
        os.path.join(fake_kudu_build_dir, "build", "latest", "src"))
    env["CPLUS_INCLUDE_PATH"] = os.path.join(kudu_client_dir, "include")
    env["LIBRARY_PATH"] = os.path.pathsep.join([os.path.join(kudu_client_dir, 'lib'),
        os.path.join(kudu_client_dir, 'lib64')])
    exec_pip_install(["-r", KUDU_REQS_PATH], cc=cc, env=env)
    mark_reqs_installed(KUDU_REQS_PATH)
  finally:
    try:
      shutil.rmtree(fake_kudu_build_dir)
    except Exception:
      LOG.debug("Error removing temp Kudu build dir", exc_info=True)


def find_kudu_client_install_dir():
  custom_client_dir = os.environ["KUDU_CLIENT_DIR"]
  if custom_client_dir:
    install_dir = os.path.join(custom_client_dir, "usr", "local")
    error_if_kudu_client_not_found(install_dir)
  else:
    # If the toolchain appears to have been set up already, then the Kudu client is
    # required to exist. It's possible that the toolchain won't be set up yet though,
    # since the toolchain bootstrap script depends on the virtualenv.
    kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
    install_dir = os.path.join(kudu_base_dir, "debug")
    if os.path.exists(kudu_base_dir):
      error_if_kudu_client_not_found(install_dir)
  return install_dir


def error_if_kudu_client_not_found(install_dir):
  header_path = os.path.join(install_dir, "include", "kudu", "client", "client.h")
  if not os.path.exists(header_path):
    raise Exception("Kudu client header not found at %s" % header_path)

  kudu_client_lib = "libkudu_client.so"
  lib_dir = os.path.join(install_dir, "lib64")
  if not os.path.exists(lib_dir):
    lib_dir = os.path.join(install_dir, "lib")
  for _, _, files in os.walk(lib_dir):
    for file in files:
      if file == kudu_client_lib:
        return
  raise Exception("%s not found at %s" % (kudu_client_lib, lib_dir))


def mark_reqs_installed(reqs_path):
  '''Mark that the requirements from the given file are installed by copying it into
  the root directory of the virtualenv.'''
  installed_reqs_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
  shutil.copyfile(reqs_path, installed_reqs_path)


def reqs_are_installed(reqs_path):
  '''Check if the requirements from the given file are installed in the virtualenv by
  looking for a matching requirements file in the root directory of the virtualenv.'''
  installed_reqs_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
  if not os.path.exists(installed_reqs_path):
    return False
  installed_reqs_file = open(installed_reqs_path)
  try:
    reqs_file = open(reqs_path)
    try:
      if reqs_file.read() == installed_reqs_file.read():
        return True
      else:
        LOG.debug("Virtualenv upgrade needed")
        return False
    finally:
      reqs_file.close()
  finally:
    installed_reqs_file.close()


def setup_virtualenv_if_not_exists():
  if not reqs_are_installed(REQS_PATH):
    delete_virtualenv_if_exist()
    create_virtualenv()
    install_deps()
    LOG.debug("Virtualenv setup complete")


if __name__ == "__main__":
  parser = optparse.OptionParser()
  parser.add_option("-l", "--log-level", default="INFO",
      choices=("DEBUG", "INFO", "WARN", "ERROR"))
  parser.add_option("-r", "--rebuild", action="store_true", help="Force a rebuild of"
      " the virtualenv even if it exists and appears to be completely up-to-date.")
  parser.add_option("--print-ld-library-path", action="store_true", help="Print the"
      " LD_LIBRARY_PATH that should be used when running python from the virtualenv.")
  options, args = parser.parse_args()

  if options.print_ld_library_path:
    # Some python packages have native code that is compiled with the toolchain
    # compiler, so that code needs to dynamically link against matching library
    # versions.
    ld_library_dirs = [os.path.join(toolchain_pkg_dir("gcc"), 'lib64')]
    kudu_client_dir = find_kudu_client_install_dir()
    ld_library_dirs.append(os.path.join(kudu_client_dir, 'lib'))
    ld_library_dirs.append(os.path.join(kudu_client_dir, 'lib64'))
    print(os.path.pathsep.join(ld_library_dirs))
    sys.exit()

  logging.basicConfig(level=getattr(logging, options.log_level))

  if options.rebuild:
    delete_virtualenv_if_exist()

  # Complete as many bootstrap steps as possible (see file comment for the steps).
  setup_virtualenv_if_not_exists()
  install_kudu_client_if_possible()
  install_adls_deps()