IMPALA-10606: Simplify impala-python virtualenv bootstrapping

Bootstrapping the impala-python virtualenv requires multiple
rounds of pip installs with different sets of requirements.
This consolidates the requirements.txt, stage2-requirements.txt,
and compiled-requirements.txt into a single requirements.txt.
This will make it easier to upgrade python packages.

This also splits out setuptools into its own
setuptools-requirements.txt. Setuptools is used during the
pip install for several of the dependencies. Recent versions
of setuptools do not support Python 2, but some of the install
tools (like easy_install) don't know how to pick a version
of setuptools that works with Python 2. Splitting it out to its
own requirements file lets us pin the version.

To make review easier, this does not change any of the versions
of the dependencies. It also leaves the stage2-requirements.txt
and compiled-requirements.txt split out in separate sections
of requirements.txt. These will later be turned into a single
alphabetical list.

Testing:
 - Tested impala-python locally
 - Ran GVO

Change-Id: I8e920e5a257f1e0613065685078624a50d59bf2e
Reviewed-on: http://gerrit.cloudera.org:8080/17226
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
This commit is contained in:
Joe McDonnell
2021-03-24 14:33:37 -07:00
parent 45fb0fb3e7
commit 1142c7b58e
5 changed files with 88 additions and 121 deletions

View File

@@ -18,21 +18,19 @@
# This module will create a python virtual env and install external dependencies. If the
# virtualenv already exists and it contains all the expected packages, nothing is done.
#
# A multi-step bootstrapping process is required to build and install all of the
# dependencies:
# 1. install basic non-C/C++ packages into the virtualenv
# 1b. install packages that depend on step 1 but cannot be installed together with their
# dependencies
# 2. use the virtualenv Python to bootstrap the toolchain
# 3. use toolchain gcc to build C/C++ packages
# 4. build the kudu-python package with toolchain gcc and Cython
# It is expected that bootstrap_toolchain.py already ran prior to running this
# (and thus the toolchain GCC compiler is in place).
#
# Every time this script is run, it completes as many of the bootstrapping steps as
# possible with the available dependencies.
# The virtualenv creation process involves multiple rounds of pip installs, but
# this script expects to complete all rounds in a single invocation. The steps are:
# 1. Install setuptools and its dependencies. These are used by the setup.py scripts
# that run during pip install.
# 2. Install most packages (including ones that require C/C++ compilation)
# 3. Install Kudu package (which uses the toolchain GCC and the installed Cython)
# 4. Install ADLS packages if applicable
#
# This module can be run with python >= 2.4 but python >= 2.6 must be installed on the
# system. If the default 'python' command refers to < 2.6, python 2.6 will be used
# instead.
# This module can be run with python >= 2.7. It makes no guarantees about usage on
# python < 2.7.
from __future__ import print_function
import glob
@@ -44,7 +42,6 @@ import subprocess
import sys
import tarfile
import tempfile
import textwrap
import urllib
from bootstrap_toolchain import ToolchainPackage
@@ -57,25 +54,27 @@ GCC_VERSION = os.environ["IMPALA_GCC_VERSION"]
DEPS_DIR = os.path.join(os.path.dirname(__file__), "deps")
ENV_DIR = os.path.join(os.path.dirname(__file__), "env-gcc{0}".format(GCC_VERSION))
# Requirements file with packages we need for our build and tests.
# Setuptools requirements file. Setuptools is required during pip install for
# some packages. Newer setuptools dropped python 2 support, and some python
# install tools don't understand that they need to get a version that works
# with the current python version. This can cause them to try to install the newer
# setuptools that won't work on python 2. Doing this as a separate step makes it
# easy to pin the version of setuptools to a Python 2 compatible version.
SETUPTOOLS_REQS_PATH = os.path.join(DEPS_DIR, "setuptools-requirements.txt")
# Requirements file with packages we need for our build and tests, which depends
# on setuptools being installed by the setuptools requirements step.
REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")
# Second stage of requirements which cannot be installed together with their dependencies
# in requirements.txt.
REQS2_PATH = os.path.join(DEPS_DIR, "stage2-requirements.txt")
# Requirements for the next bootstrapping step that builds compiled requirements
# with toolchain gcc.
COMPILED_REQS_PATH = os.path.join(DEPS_DIR, "compiled-requirements.txt")
# Requirements for the Kudu bootstrapping step, which depends on Cython being installed
# by the compiled requirements step.
# by the requirements step.
KUDU_REQS_PATH = os.path.join(DEPS_DIR, "kudu-requirements.txt")
# Requirements for the ADLS test client step, which depends on Cffi (C Foreign Function
# Interface) being installed by the compiled requirements step.
# Interface) being installed by the requirements step.
ADLS_REQS_PATH = os.path.join(DEPS_DIR, "adls-requirements.txt")
def delete_virtualenv_if_exist():
if os.path.exists(ENV_DIR):
shutil.rmtree(ENV_DIR)
@@ -108,6 +107,7 @@ def exec_cmd(args, **kwargs):
% (args, output))
return output
def use_ccache():
'''Returns true if ccache is available and should be used'''
if 'DISABLE_CCACHE' in os.environ: return False
@@ -117,6 +117,7 @@ def use_ccache():
except:
return False
def select_cc():
'''Return the C compiler command that should be used as a string or None if the
compiler is not available '''
@@ -129,6 +130,7 @@ def select_cc():
if use_ccache(): cc = "ccache %s" % cc
return cc
def exec_pip_install(args, cc="no-cc-available", env=None):
'''Executes "pip install" with the provided command line arguments. If 'cc' is set,
it is used as the C compiler. Otherwise compilation of C/C++ code is disabled by
@@ -218,48 +220,28 @@ def download_toolchain_python():
def install_deps():
LOG.info("Installing setuptools into the virtualenv")
exec_pip_install(["-r", SETUPTOOLS_REQS_PATH])
cc = select_cc()
if cc is None:
raise Exception("CC not available")
env = dict(os.environ)
LOG.info("Installing packages into the virtualenv")
exec_pip_install(["-r", REQS_PATH])
exec_pip_install(["-r", REQS_PATH], cc=cc, env=env)
mark_reqs_installed(REQS_PATH)
LOG.info("Installing stage 2 packages into the virtualenv")
exec_pip_install(["-r", REQS2_PATH])
mark_reqs_installed(REQS2_PATH)
def have_toolchain():
'''Return true if the Impala toolchain is available'''
return "IMPALA_TOOLCHAIN_PACKAGES_HOME" in os.environ
def toolchain_pkg_dir(pkg_name):
'''Return the path to the toolchain package'''
pkg_version = os.environ["IMPALA_" + pkg_name.upper() + "_VERSION"]
return os.path.join(os.environ["IMPALA_TOOLCHAIN_PACKAGES_HOME"],
pkg_name + "-" + pkg_version)
def install_compiled_deps_if_possible():
'''Install dependencies that require compilation with toolchain GCC, if the toolchain
is available. Returns true if the deps are installed'''
if reqs_are_installed(COMPILED_REQS_PATH):
LOG.debug("Skipping compiled deps: matching compiled-installed-requirements.txt found")
return True
cc = select_cc()
if cc is None:
LOG.debug("Skipping compiled deps: cc not available yet")
return False
env = dict(os.environ)
# Compilation of pycrypto fails on CentOS 5 with newer GCC versions because of a
# problem with inline declarations in older libc headers. Setting -fgnu89-inline is a
# workaround.
distro_version = ''.join(exec_cmd(["lsb_release", "-irs"]).lower().split())
print(distro_version)
if distro_version.startswith("centos5."):
env["CFLAGS"] = "-fgnu89-inline"
LOG.info("Installing compiled requirements into the virtualenv")
exec_pip_install(["-r", COMPILED_REQS_PATH], cc=cc, env=env)
mark_reqs_installed(COMPILED_REQS_PATH)
return True
def install_adls_deps():
# The ADLS dependencies require that the OS is at least CentOS 6.7 or above,
@@ -275,9 +257,10 @@ def install_adls_deps():
exec_pip_install(["-r", ADLS_REQS_PATH], cc=cc)
mark_reqs_installed(ADLS_REQS_PATH)
def install_kudu_client_if_possible():
'''Installs the Kudu python module if possible, which depends on the toolchain and
the compiled requirements in compiled-requirements.txt. If the toolchain isn't
the compiled requirements in requirements.txt. If the toolchain isn't
available, nothing will be done.'''
if reqs_are_installed(KUDU_REQS_PATH):
LOG.debug("Skipping Kudu: matching kudu-installed-requirements.txt found")
@@ -344,12 +327,14 @@ def error_if_kudu_client_not_found(install_dir):
return
raise Exception("%s not found at %s" % (kudu_client_lib, lib_dir))
def mark_reqs_installed(reqs_path):
'''Mark that the requirements from the given file are installed by copying it into the root
directory of the virtualenv.'''
'''Mark that the requirements from the given file are installed by copying it into
the root directory of the virtualenv.'''
installed_reqs_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
shutil.copyfile(reqs_path, installed_reqs_path)
def reqs_are_installed(reqs_path):
'''Check if the requirements from the given file are installed in the virtualenv by
looking for a matching requirements file in the root directory of the virtualenv.'''
@@ -370,8 +355,9 @@ def reqs_are_installed(reqs_path):
finally:
installed_reqs_file.close()
def setup_virtualenv_if_not_exists():
if not (reqs_are_installed(REQS_PATH) and reqs_are_installed(REQS2_PATH)):
if not (reqs_are_installed(REQS_PATH)):
delete_virtualenv_if_exist()
create_virtualenv()
install_deps()
@@ -405,6 +391,5 @@ if __name__ == "__main__":
# Complete as many bootstrap steps as possible (see file comment for the steps).
setup_virtualenv_if_not_exists()
if install_compiled_deps_if_possible():
install_kudu_client_if_possible()
install_adls_deps()
install_kudu_client_if_possible()
install_adls_deps()

View File

@@ -37,9 +37,8 @@ NUM_DOWNLOAD_ATTEMPTS = 8
PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')
# The requirement files that list all of the required packages and versions.
REQUIREMENTS_FILES = ['requirements.txt', 'stage2-requirements.txt',
'compiled-requirements.txt', 'kudu-requirements.txt',
'adls-requirements.txt']
REQUIREMENTS_FILES = ['requirements.txt', 'setuptools-requirements.txt',
'kudu-requirements.txt', 'adls-requirements.txt']
def check_digest(filename, algorithm, expected_digest):

View File

@@ -48,9 +48,6 @@ python-magic == 0.4.11
# attempting to install pywebhdfs (https://github.com/pywebhdfs/pywebhdfs/issues/52).
# pywebhdfs itself is listed further down in this file (formerly stage 2).
pbr == 3.1.1
# Newer versions of setuptools don't support Python 2.6
setuptools == 36.8.0
setuptools-scm == 1.15.4
sh == 1.11
six == 1.14.0
sqlparse == 0.3.1
@@ -61,3 +58,40 @@ ipython == 1.2.1
apipkg == 1.4
virtualenv == 13.1.0
#### Formerly stage2-requirements.txt
# Requires setuptools-scm
pytest == 2.9.2
py == 1.4.32
pytest-forked == 0.2
pytest-random == 0.02
pytest-runner == 4.2
pytest-xdist == 1.17.1
pytest-timeout == 1.2.1
hdfs == 2.0.2
docopt == 0.6.2
execnet == 1.4.0
# Requires pbr
pywebhdfs == 0.3.2
requests == 2.20.0
chardet == 3.0.4
idna == 2.8
urllib3 == 1.21.1
certifi == 2020.12.5
#### Formerly compiled-requirements.txt
argparse == 1.4.0
impyla == 0.17a1
bitarray == 1.2.1
sasl == 0.2.1
# six == 1.14.0 (specified above)
thrift_sasl == 0.4.2
psutil == 5.6.3
# Required for Kudu:
Cython == 0.23.4
numpy == 1.10.4
pytz == 2018.3

View File

@@ -15,17 +15,6 @@
# specific language governing permissions and limitations
# under the License.
# Requirements that require a C/C++ compiler to build, which may not be available until
# after the toolchain is bootstrapped. Installed after requirements.txt
argparse == 1.4.0
impyla == 0.17a1
bitarray == 1.2.1
sasl == 0.2.1
six == 1.14.0
thrift_sasl == 0.4.2
psutil == 5.6.3
# Required for Kudu:
Cython == 0.23.4
numpy == 1.10.4
pytz == 2018.3
# Newer versions of setuptools don't support Python 2.6
setuptools == 36.8.0
setuptools-scm == 1.15.4

View File

@@ -1,40 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# This file contains packages that have dependencies in requirements.txt and that have to
# be installed in a separate invocation of pip.
# Requires setuptools-scm
pytest == 2.9.2
py == 1.4.32
pytest-forked == 0.2
pytest-random == 0.02
pytest-runner == 4.2
pytest-xdist == 1.17.1
pytest-timeout == 1.2.1
hdfs == 2.0.2
docopt == 0.6.2
execnet == 1.4.0
# Requires pbr
pywebhdfs == 0.3.2
requests == 2.20.0
chardet == 3.0.4
idna == 2.8
urllib3 == 1.21.1
certifi == 2020.12.5