IMPALA-11959: Add Python 3 virtualenv

This adds a Python 3 equivalent to the impala-python
virtualenv base on the toolchain Python 3.7.16.
This modifies bootstrap_virtualenv.py to support
the two different modes. This adds py2-requirements.txt
and py3-requirements.txt to allow some differences
between the Python 2 and Python 3 virtualenvs.

Here are some specific package changes:
 - allpairs is replaced with allpairspy, as allpairs did
   not support Python 3.
 - requests is upgraded slightly, because otherwise is has issues
   with idna==2.8.
 - pylint is limited to Python 3, because we are adding it
   and don't need it on both
 - flake8 is limited to Python 2, because it will take
   some work to switch to a version that works on Python 3
 - cm_api is limited to Python 2, because it doesn't support
   Python 3
 - pytest-random does not support Python 3 and it is unused,
   so it is removed
 - Bump the version of setuptool-scm to support Python 3

This adds impala-pylint, which can be used to do further
Python 3 checks via --py3k. This also adds a bin/check-pylint-py3k.sh
script to enforce specific py3k checks. The banned py3k warnings
are specified in the bin/banned_py3k_warnings.txt. This is currently
empty, but this can ratchet up the py3k strictness over time
to avoid regressions.

This pulls in a new toolchain with the fix for IMPALA-11956
to get Python 3.7.16.

Testing:
 - Hand tested that the allpairs libraries produce the
   same results
 - The python3 virtualenv has no influence on regular
   tests yet

Change-Id: Ica4853f440c9a46a79bd5fb8e0a66730b0b4efc0
Reviewed-on: http://gerrit.cloudera.org:8080/19567
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
This commit is contained in:
Joe McDonnell
2021-04-06 21:36:07 -07:00
parent 148888e3ed
commit 566df80891
21 changed files with 452 additions and 75 deletions

View File

@@ -55,7 +55,10 @@ SKIP_TOOLCHAIN_BOOTSTRAP = "SKIP_TOOLCHAIN_BOOTSTRAP"
GCC_VERSION = os.environ["IMPALA_GCC_VERSION"]
DEPS_DIR = os.path.join(os.path.dirname(__file__), "deps")
ENV_DIR = os.path.join(os.path.dirname(__file__), "env-gcc{0}".format(GCC_VERSION))
ENV_DIR_PY2 = os.path.join(os.path.dirname(__file__),
"env-gcc{0}".format(GCC_VERSION))
ENV_DIR_PY3 = os.path.join(os.path.dirname(__file__),
"env-gcc{0}-py3".format(GCC_VERSION))
# Setuptools requirements file. Setuptools is required during pip install for
# some packages. Newer setuptools dropped python 2 support, and some python
@@ -77,10 +80,16 @@ KUDU_REQS_PATH = os.path.join(DEPS_DIR, "kudu-requirements.txt")
# Interface) being installed by the requirements step.
ADLS_REQS_PATH = os.path.join(DEPS_DIR, "adls-requirements.txt")
# Extra packages specific to python 3
PY3_REQS_PATH = os.path.join(DEPS_DIR, "py3-requirements.txt")
def delete_virtualenv_if_exist():
if os.path.exists(ENV_DIR):
shutil.rmtree(ENV_DIR)
# Extra packages specific to python 2
PY2_REQS_PATH = os.path.join(DEPS_DIR, "py2-requirements.txt")
def delete_virtualenv_if_exist(venv_dir):
if os.path.exists(venv_dir):
shutil.rmtree(venv_dir)
def detect_virtualenv_version():
@@ -99,8 +108,16 @@ def detect_virtualenv_version():
return None
def create_virtualenv():
LOG.info("Creating python virtualenv")
def create_virtualenv(venv_dir, is_py3):
if is_py3:
# Python 3 is much simpler, because there is a builtin venv command
LOG.info("Creating python3 virtualenv")
python_cmd = download_toolchain_python(is_py3)
exec_cmd([python_cmd, "-m" "venv", venv_dir])
return
# Python 2
LOG.info("Creating python2 virtualenv")
build_dir = tempfile.mkdtemp()
# Try to find the virtualenv version by parsing the requirements file
# Default to "*" if we can't figure it out.
@@ -114,9 +131,9 @@ def create_virtualenv():
for member in file.getmembers():
file.extract(member, build_dir)
file.close()
python_cmd = download_toolchain_python()
python_cmd = download_toolchain_python(is_py3)
exec_cmd([python_cmd, find_file(build_dir, "virtualenv*", "virtualenv.py"), "--quiet",
"--python", python_cmd, ENV_DIR])
"--python", python_cmd, venv_dir])
shutil.rmtree(build_dir)
@@ -147,7 +164,7 @@ def select_cc():
return cc
def exec_pip_install(args, cc="no-cc-available", env=None):
def exec_pip_install(venv_dir, is_py3, args, cc="no-cc-available", env=None):
'''Executes "pip install" with the provided command line arguments. If 'cc' is set,
it is used as the C compiler. Otherwise compilation of C/C++ code is disabled by
setting the CC environment variable to a bogus value.
@@ -169,8 +186,12 @@ def exec_pip_install(args, cc="no-cc-available", env=None):
# Don't call the virtualenv pip directly, it uses a hashbang to to call the python
# virtualenv using an absolute path. If the path to the virtualenv is very long, the
# hashbang won't work.
impala_pip_base_cmd = [os.path.join(ENV_DIR, "bin", "python"),
os.path.join(ENV_DIR, "bin", "pip"), "install", "-v"]
if is_py3:
impala_pip_base_cmd = [os.path.join(venv_dir, "bin", "python3"),
os.path.join(venv_dir, "bin", "pip3"), "install", "-v"]
else:
impala_pip_base_cmd = [os.path.join(venv_dir, "bin", "python"),
os.path.join(venv_dir, "bin", "pip"), "install", "-v"]
# Passes --no-binary for IMPALA-3767: without this, Cython (and
# several other packages) fail download.
@@ -181,7 +202,9 @@ def exec_pip_install(args, cc="no-cc-available", env=None):
impala_pip_base_cmd[:] + ["--no-binary", ":all:", "--no-cache-dir"]
# When using a custom mirror, we also must use the index of that mirror.
if "PYPI_MIRROR" in os.environ:
# The python 3 virtualenv has trouble with using --index-url with PYPI_MIRROR,
# so it falls back to --no-index, which works fine.
if "PYPI_MIRROR" in os.environ and not is_py3:
third_party_pkg_install_cmd.extend(["--index-url",
"%s/simple" % os.environ["PYPI_MIRROR"]])
else:
@@ -217,7 +240,7 @@ def find_file(*paths):
return files[0]
def download_toolchain_python():
def download_toolchain_python(is_py3):
'''Grabs the Python implementation from the Impala toolchain, using the machinery from
bin/bootstrap_toolchain.py.
Skip the download if SKIP_TOOLCHAIN_BOOTSTRAP=true in the environment. In that case
@@ -229,27 +252,35 @@ def download_toolchain_python():
raise Exception("Impala environment not set up correctly, make sure "
"$IMPALA_TOOLCHAIN_PACKAGES_HOME is set.")
package = ToolchainPackage("python")
if is_py3:
package = ToolchainPackage("python",
explicit_version=os.environ["IMPALA_PYTHON3_VERSION"])
else:
package = ToolchainPackage("python")
if package.needs_download() and \
not (os.environ.get(SKIP_TOOLCHAIN_BOOTSTRAP) == 'true'):
package.download()
python_cmd = os.path.join(package.pkg_directory(), "bin/python")
if is_py3:
python_cmd = os.path.join(package.pkg_directory(), "bin/python3")
else:
python_cmd = os.path.join(package.pkg_directory(), "bin/python")
if not os.path.exists(python_cmd):
raise Exception("Unexpected error bootstrapping python from toolchain: {0} does not "
"exist".format(python_cmd))
return python_cmd
def install_deps():
LOG.info("Installing setuptools into the virtualenv")
exec_pip_install(["-r", SETUPTOOLS_REQS_PATH])
def install_deps(venv_dir, is_py3):
py_str = "3" if is_py3 else "2"
LOG.info("Installing setuptools into the python{0} virtualenv".format(py_str))
exec_pip_install(venv_dir, is_py3, ["-r", SETUPTOOLS_REQS_PATH])
cc = select_cc()
if cc is None:
raise Exception("CC not available")
env = dict(os.environ)
LOG.info("Installing packages into the virtualenv")
exec_pip_install(["-r", REQS_PATH], cc=cc, env=env)
mark_reqs_installed(REQS_PATH)
LOG.info("Installing packages into the python{0} virtualenv".format(py_str))
exec_pip_install(venv_dir, is_py3, ["-r", REQS_PATH], cc=cc, env=env)
mark_reqs_installed(venv_dir, REQS_PATH)
def have_toolchain():
@@ -264,26 +295,44 @@ def toolchain_pkg_dir(pkg_name):
pkg_name + "-" + pkg_version)
def install_adls_deps():
def install_adls_deps(venv_dir, is_py3):
# The ADLS dependencies require that the OS is at least CentOS 6.7 or above,
# which is why we break this into a seperate step. If the target filesystem is
# ADLS, the expectation is that the dev environment is running at least CentOS 6.7.
if os.environ.get('TARGET_FILESYSTEM') == "adls":
if reqs_are_installed(ADLS_REQS_PATH):
if reqs_are_installed(venv_dir, ADLS_REQS_PATH):
LOG.debug("Skipping ADLS deps: matching adls-installed-requirements.txt found")
return True
cc = select_cc()
assert cc is not None
LOG.info("Installing ADLS packages into the virtualenv")
exec_pip_install(["-r", ADLS_REQS_PATH], cc=cc)
mark_reqs_installed(ADLS_REQS_PATH)
py_str = "3" if is_py3 else "2"
LOG.info("Installing ADLS packages into the python{0} virtualenv".format(py_str))
exec_pip_install(venv_dir, is_py3, ["-r", ADLS_REQS_PATH], cc=cc)
mark_reqs_installed(venv_dir, ADLS_REQS_PATH)
def install_kudu_client_if_possible():
def install_py_version_deps(venv_dir, is_py3):
cc = select_cc()
assert cc is not None
if not is_py3:
if not reqs_are_installed(venv_dir, PY2_REQS_PATH):
# These are extra python2-only packages
LOG.info("Installing python2 packages into the virtualenv")
exec_pip_install(venv_dir, is_py3, ["-r", PY2_REQS_PATH], cc=cc)
mark_reqs_installed(venv_dir, PY2_REQS_PATH)
else:
if not reqs_are_installed(venv_dir, PY3_REQS_PATH):
# These are extra python3-only packages
LOG.info("Installing python3 packages into the virtualenv")
exec_pip_install(venv_dir, is_py3, ["-r", PY3_REQS_PATH], cc=cc)
mark_reqs_installed(venv_dir, PY3_REQS_PATH)
def install_kudu_client_if_possible(venv_dir, is_py3):
'''Installs the Kudu python module if possible, which depends on the toolchain and
the compiled requirements in requirements.txt. If the toolchain isn't
available, nothing will be done.'''
if reqs_are_installed(KUDU_REQS_PATH):
if reqs_are_installed(venv_dir, KUDU_REQS_PATH):
LOG.debug("Skipping Kudu: matching kudu-installed-requirements.txt found")
return
kudu_base_dir = os.environ["IMPALA_KUDU_HOME"]
@@ -291,11 +340,13 @@ def install_kudu_client_if_possible():
LOG.debug("Skipping Kudu: %s doesn't exist" % kudu_base_dir)
return
LOG.info("Installing Kudu into the virtualenv")
py_str = "3" if is_py3 else "2"
LOG.info("Installing Kudu into the python{0} virtualenv".format(py_str))
# The installation requires that KUDU_HOME/build/latest exists. An empty directory
# structure will be made to satisfy that. The Kudu client headers and lib will be made
# available through GCC environment variables.
fake_kudu_build_dir = os.path.join(tempfile.gettempdir(), "virtualenv-kudu")
fake_kudu_build_dir = os.path.join(tempfile.gettempdir(),
"virtualenv-kudu{0}".format(py_str))
try:
artifact_dir = os.path.join(fake_kudu_build_dir, "build", "latest")
if not os.path.exists(artifact_dir):
@@ -312,8 +363,8 @@ def install_kudu_client_if_possible():
env["CPLUS_INCLUDE_PATH"] = os.path.join(kudu_client_dir, "include")
env["LIBRARY_PATH"] = os.path.pathsep.join([os.path.join(kudu_client_dir, 'lib'),
os.path.join(kudu_client_dir, 'lib64')])
exec_pip_install(["-r", KUDU_REQS_PATH], cc=cc, env=env)
mark_reqs_installed(KUDU_REQS_PATH)
exec_pip_install(venv_dir, is_py3, ["-r", KUDU_REQS_PATH], cc=cc, env=env)
mark_reqs_installed(venv_dir, KUDU_REQS_PATH)
finally:
try:
shutil.rmtree(fake_kudu_build_dir)
@@ -353,17 +404,17 @@ def error_if_kudu_client_not_found(install_dir):
raise Exception("%s not found at %s" % (kudu_client_lib, lib_dir))
def mark_reqs_installed(reqs_path):
def mark_reqs_installed(venv_dir, reqs_path):
'''Mark that the requirements from the given file are installed by copying it into
the root directory of the virtualenv.'''
installed_reqs_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
installed_reqs_path = os.path.join(venv_dir, os.path.basename(reqs_path))
shutil.copyfile(reqs_path, installed_reqs_path)
def reqs_are_installed(reqs_path):
def reqs_are_installed(venv_dir, reqs_path):
'''Check if the requirements from the given file are installed in the virtualenv by
looking for a matching requirements file in the root directory of the virtualenv.'''
installed_reqs_path = os.path.join(ENV_DIR, os.path.basename(reqs_path))
installed_reqs_path = os.path.join(venv_dir, os.path.basename(reqs_path))
if not os.path.exists(installed_reqs_path):
return False
installed_reqs_file = open(installed_reqs_path)
@@ -381,11 +432,11 @@ def reqs_are_installed(reqs_path):
installed_reqs_file.close()
def setup_virtualenv_if_not_exists():
if not (reqs_are_installed(REQS_PATH)):
delete_virtualenv_if_exist()
create_virtualenv()
install_deps()
def setup_virtualenv_if_not_exists(venv_dir, is_py3):
if not (reqs_are_installed(venv_dir, REQS_PATH)):
delete_virtualenv_if_exist(venv_dir)
create_virtualenv(venv_dir, is_py3)
install_deps(venv_dir, is_py3)
LOG.debug("Virtualenv setup complete")
@@ -397,6 +448,8 @@ if __name__ == "__main__":
" the virtualenv even if it exists and appears to be completely up-to-date.")
parser.add_option("--print-ld-library-path", action="store_true", help="Print the"
" LD_LIBRARY_PATH that should be used when running python from the virtualenv.")
parser.add_option("--python3", action="store_true", help="Generate the python3"
" virtualenv")
options, args = parser.parse_args()
if options.print_ld_library_path:
@@ -411,10 +464,17 @@ if __name__ == "__main__":
sys.exit()
logging.basicConfig(level=getattr(logging, options.log_level))
if options.python3:
venv_dir = ENV_DIR_PY3
else:
venv_dir = ENV_DIR_PY2
if options.rebuild:
delete_virtualenv_if_exist()
delete_virtualenv_if_exist(venv_dir)
# Complete as many bootstrap steps as possible (see file comment for the steps).
setup_virtualenv_if_not_exists()
install_kudu_client_if_possible()
install_adls_deps()
setup_virtualenv_if_not_exists(venv_dir, options.python3)
install_kudu_client_if_possible(venv_dir, options.python3)
install_adls_deps(venv_dir, options.python3)
install_py_version_deps(venv_dir, options.python3)

View File

@@ -38,7 +38,8 @@ PYPI_MIRROR = os.environ.get('PYPI_MIRROR', 'https://pypi.python.org')
# The requirement files that list all of the required packages and versions.
REQUIREMENTS_FILES = ['requirements.txt', 'setuptools-requirements.txt',
'kudu-requirements.txt', 'adls-requirements.txt']
'kudu-requirements.txt', 'adls-requirements.txt',
'py2-requirements.txt', 'py3-requirements.txt']
def check_digest(filename, algorithm, expected_digest):

View File

@@ -0,0 +1,34 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Python2-only requirements
cm-api == 10.0.0
# Already available as part of python on Linux.
readline == 6.2.4.1; sys_platform == 'darwin'
flake8 == 3.9.2
mccabe == 0.6.1
pycodestyle == 2.7.0
pyflakes == 2.3.1
enum34 == 1.1.10
typing == 3.10.0.0
configparser == 4.0.2
functools32 == 3.2.3-2
importlib-metadata == 2.1.3
contextlib2 == 0.6.0
pathlib2 == 2.3.7.post1
zipp == 1.2.0

View File

@@ -0,0 +1,31 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Python3-only requirements
pylint == 2.10.2
astroid == 2.7.3
lazy-object-proxy == 1.6.0
wrapt == 1.12.1
typed-ast == 1.4.3
configparser == 4.0.2
isort == 4.3.21
futures == 3.3.0; python_version == "2.7"
singledispatch == 3.6.1
toml == 0.10.2
platformdirs == 2.4.1
typing-extensions == 3.10.0.2

View File

@@ -20,23 +20,8 @@
# Dependents are indented. Dependents that have multiple parents are not listed
# multiple times (though maybe they could be).
allpairs == 2.0.1
allpairspy == 2.5.0
argparse == 1.4.0
cm-api == 10.0.0
# Already available as part of python on Linux.
readline == 6.2.4.1; sys_platform == 'darwin'
flake8 == 3.9.2
mccabe == 0.6.1
pycodestyle == 2.7.0
pyflakes == 2.3.1
enum34 == 1.1.10
typing == 3.10.0.0
configparser == 4.0.2
functools32 == 3.2.3-2
importlib-metadata == 2.1.3
contextlib2 == 0.6.0
pathlib2 == 2.3.7.post1
zipp == 1.2.0
future == 0.18.3
gcovr == 4.2
Jinja2 == 2.11.3
@@ -61,14 +46,13 @@ pyparsing == 2.0.3
pytest == 2.9.2
py == 1.4.32
pytest-forked == 0.2
pytest-random == 0.02
pytest-runner == 4.2
pytest-xdist == 1.17.1
pytest-timeout == 1.2.1
python-magic == 0.4.11
pywebhdfs == 0.3.2
pbr == 3.1.1
requests == 2.20.0
requests == 2.21.0
chardet == 3.0.4
idna == 2.8
urllib3 == 1.24.2

View File

@@ -18,4 +18,4 @@
# Newer versions of setuptools don't support Python 2.7
setuptools == 44.1.1
wheel == 0.35.1
setuptools-scm == 4.1.2
setuptools-scm == 5.0.2