IMPALA-9626: Use Python from the toolchain for Impala

Historically Impala used the Python2 version that was available on
the hosting platform, as long as that version was at least v2.6.
This caused constant headaches, as all Python syntax had to be kept
compatible with Python 2.6 (for CentOS 6). It also caused a recent problem
on CentOS 8: there the system Python version was compiled with the
system's GCC version (v8.3), which was much more recent than the Impala
standard compiler version (GCC 4.9.2). When the Impala virtualenv was
built, the system Python version supplied C compiler switches for modules
containing native code that were unknown to the Impala version of GCC,
thus breaking virtualenv installation.

This patch changes the Impala virtualenv to always use the Python2
version from the toolchain, which is built with the toolchain compiler.

This ensures that
- Impala always has a known Python 2.7 version for all its scripts,
- virtualenv modules based on native code will always be installable, as
  the Python environment and the modules are built with the same compiler
  version.

Additional changes:
- Add an auto-use fixture to conftest.py to check that the tests are
  being run with Python 2.7.x
- Make bootstrap_toolchain.py independent from the Impala virtualenv:
  remove the dependency on the "sh" library

Tests:
- Passed core-mode tests on CentOS 7.4
- Passed core-mode tests in Docker-based mode for centos:7
  and ubuntu:16.04

Most content in this patch was developed but not published earlier
by Tim Armstrong.

Change-Id: Ic7b40cef89cfb3b467b61b2d54a94e708642882b
Reviewed-on: http://gerrit.cloudera.org:8080/15624
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
Laszlo Gaal
2020-03-18 18:03:24 +01:00
committed by Impala Public Jenkins
parent 21aa514353
commit c97191b6a5
5 changed files with 67 additions and 30 deletions

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env impala-python
#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -58,18 +58,12 @@
#
# The script is directly executable, and it takes no parameters:
# ./bootstrap_toolchain.py
# It should NOT be run via 'python bootstrap_toolchain.py', as it relies on a specific
# python environment.
import logging
import glob
import multiprocessing.pool
import os
import random
import re
# TODO: This file should be runnable without using impala-python, and system python
# does not have 'sh' available. Rework code to avoid importing sh (and anything else
# that gets in the way).
import sh
import shutil
import subprocess
import sys
@@ -107,6 +101,26 @@ OS_MAPPING = [
]
def check_output(cmd_args):
  """Execute 'cmd_args' as a child process and hand back everything it printed.

  stderr is redirected into stdout, so the returned value holds both streams. A
  non-zero exit code is reported by raising an Exception whose message carries the
  arguments, the exit code and the captured output. This is a stand-in for
  subprocess.check_output(), which is only available from Python 2.7 onwards.
  """
  child = subprocess.Popen(cmd_args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  captured = child.communicate()[0]
  exit_code = child.wait()
  if exit_code == 0:
    return captured
  raise Exception("Command with args '%s' failed with exit code %s:\n%s"
                  % (cmd_args, exit_code, captured))
def get_toolchain_compiler():
  """Build the <name>-<version> identifier of the compiler package that the toolchain
  is based on. The version is read from $IMPALA_GCC_VERSION; a KeyError is raised if
  that variable is not set."""
  # GCC is the only compiler in use at the moment.
  gcc_version = os.environ["IMPALA_GCC_VERSION"]
  return "gcc-{0}".format(gcc_version)
def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobber):
if not download_path.endswith("/" + file_name):
raise Exception("URL {0} does not match with expected file_name {1}"
@@ -117,7 +131,10 @@ def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobb
download_path, destination, file_name, attempt))
# --no-clobber avoids downloading the file if a file with the name already exists
try:
sh.wget(download_path, directory_prefix=destination, no_clobber=wget_no_clobber)
cmd = ["wget", download_path, "--directory-prefix={0}".format(destination)]
if wget_no_clobber:
cmd.append("--no-clobber")
check_output(cmd)
break
except Exception, e:
if attempt == NUM_ATTEMPTS:
@@ -125,8 +142,9 @@ def wget_and_unpack_package(download_path, file_name, destination, wget_no_clobb
logging.error("Download failed; retrying after sleep: " + str(e))
time.sleep(10 + random.random() * 5) # Sleep between 10 and 15 seconds.
logging.info("Extracting {0}".format(file_name))
sh.tar(z=True, x=True, f=os.path.join(destination, file_name), directory=destination)
sh.rm(os.path.join(destination, file_name))
check_output(["tar", "xzf", os.path.join(destination, file_name),
"--directory={0}".format(destination)])
os.unlink(os.path.join(destination, file_name))
class DownloadUnpackTarball(object):
@@ -241,7 +259,7 @@ class ToolchainPackage(EnvVersionedPackage):
logging.error("Impala environment not set up correctly, make sure "
"$IMPALA_TOOLCHAIN is set.")
sys.exit(1)
compiler = "gcc-{0}".format(os.environ["IMPALA_GCC_VERSION"])
compiler = get_toolchain_compiler()
label = get_platform_release_label(release=platform_release).toolchain
toolchain_build_id = os.environ["IMPALA_TOOLCHAIN_BUILD_ID"]
toolchain_host = os.environ["IMPALA_TOOLCHAIN_HOST"]
@@ -409,7 +427,8 @@ def get_platform_release_label(release=None):
if lsb_release_cache:
release = lsb_release_cache
else:
release = "".join(map(lambda x: x.lower(), sh.lsb_release("-irs").split()))
lsb_release = check_output(["lsb_release", "-irs"])
release = "".join(map(lambda x: x.lower(), lsb_release.split()))
# Only need to check against the major release if RHEL or CentOS
for platform in ['centos', 'redhatenterpriseserver']:
if platform in release:
@@ -419,7 +438,6 @@ def get_platform_release_label(release=None):
for mapping in OS_MAPPING:
if re.search(mapping.lsb_release, release):
return mapping
raise Exception("Could not find package label for OS version: {0}.".format(release))

View File

@@ -135,6 +135,8 @@ export IMPALA_PROTOBUF_VERSION=3.5.1
unset IMPALA_PROTOBUF_URL
export IMPALA_POSTGRES_JDBC_DRIVER_VERSION=42.2.5
unset IMPALA_POSTGRES_JDBC_DRIVER_URL
export IMPALA_PYTHON_VERSION=2.7.16
unset IMPALA_PYTHON_URL
export IMPALA_RAPIDJSON_VERSION=1.1.0
unset IMPALA_RAPIDJSON_URL
export IMPALA_RE2_VERSION=20190301

View File

@@ -22,7 +22,9 @@
# Setting USE_THRIFT11_GEN_PY will add Thrift 11 Python generated code rather than the
# default Thrift Python code.
# Used to allow importing testdata, test, etc modules from other scripts.
export PYTHONPATH=${IMPALA_HOME}
# ${IMPALA_HOME}/bin has bootstrap_toolchain.py, required by bootstrap_virtualenv.py
export PYTHONPATH=${IMPALA_HOME}:${IMPALA_HOME}/bin
# Generated Thrift files are used by tests and other scripts.
if [ -n "${USE_THRIFT11_GEN_PY:-}" ]; then
@@ -31,6 +33,8 @@ else
PYTHONPATH=${PYTHONPATH}:${IMPALA_HOME}/shell/gen-py
fi
PYTHONPATH=${PYTHONPATH}:${IMPALA_HOME}/infra/python/env/lib
# There should be just a single version of python that created the
# site-packages directory. We find it by performing shell independent expansion
# of the following pattern:

View File

@@ -46,6 +46,7 @@ import tarfile
import tempfile
import textwrap
import urllib
from bootstrap_toolchain import ToolchainPackage
LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0])
@@ -83,7 +84,7 @@ def create_virtualenv():
for member in file.getmembers():
file.extract(member, build_dir)
file.close()
python_cmd = detect_python_cmd()
python_cmd = download_toolchain_python()
exec_cmd([python_cmd, find_file(build_dir, "virtualenv*", "virtualenv.py"), "--quiet",
"--python", python_cmd, ENV_DIR])
shutil.rmtree(build_dir)
@@ -189,21 +190,23 @@ def find_file(*paths):
return files[0]
def detect_python_cmd():
  '''Returns the system command that provides python 2.6 or greater.

  Each well-known interpreter name is looked up, in order of preference, in every
  directory listed in $PATH. A candidate is accepted as soon as it runs a small
  version check and exits with status 0; its full path is returned. Raises an
  Exception if no suitable interpreter is found, which includes the case where $PATH
  is not set at all.
  '''
  path_env = os.getenv("PATH")
  # An unset $PATH means there is nothing to search; avoid calling split() on None.
  search_dirs = path_env.split(os.path.pathsep) if path_env is not None else []
  for cmd in ("python", "python27", "python2.7", "python-27", "python-2.7", "python26",
      "python2.6", "python-26", "python-2.6"):
    for search_dir in search_dirs:
      cmd_path = os.path.join(search_dir, cmd)
      # Only regular files can be interpreters. A directory that happens to carry one
      # of the names above also passes the X_OK check, but trying to execute it would
      # raise an OSError instead of letting the search continue.
      if not os.path.isfile(cmd_path) or not os.access(cmd_path, os.X_OK):
        continue
      # The candidate reports through its exit status whether it is too old.
      status = subprocess.call([cmd_path, "-c", textwrap.dedent("""
          import sys
          sys.exit(int(sys.version_info[:2] < (2, 6)))""")])
      if status == 0:
        return cmd_path
  raise Exception("Could not find minimum required python version 2.6")
def download_toolchain_python():
  '''Fetches the Python package of the Impala toolchain through the ToolchainPackage
  class of bin/bootstrap_toolchain.py and returns the path of its "bin/python"
  executable.

  Raises an Exception if $IMPALA_TOOLCHAIN is unset or empty, or if the interpreter is
  missing from the package directory after the download.
  '''
  if not os.environ.get("IMPALA_TOOLCHAIN"):
    raise Exception(
        "Impala environment not set up correctly, make sure $IMPALA_TOOLCHAIN is set.")
  python_pkg = ToolchainPackage("python")
  python_pkg.download()
  interpreter = os.path.join(python_pkg.pkg_directory(), "bin/python")
  if os.path.exists(interpreter):
    return interpreter
  raise Exception("Unexpected error bootstrapping python from toolchain: {0} does not "
                  "exist".format(interpreter))
def install_deps():

View File

@@ -26,6 +26,7 @@ import contextlib
import logging
import os
import pytest
import sys
import tests.common
from impala_py_lib.helpers import find_all_files, is_core_dump
@@ -609,6 +610,15 @@ def cluster_properties():
yield cluster_properties
@pytest.fixture(autouse=True, scope='session')
def validate_python_version():
  """Session-scoped, automatically used guard on the interpreter version.

  Fails with an assertion error when the running Python is older than 2.7. Impala uses
  the toolchain Python, which is at least v2.7, so the tests are not expected to work
  on anything below that.
  """
  major_minor = sys.version_info[:2]
  assert major_minor >= (2, 7), "Tests only support Python 2.7+"
@pytest.hookimpl(trylast=True)
def pytest_collection_modifyitems(items, config, session):
"""Hook to handle --shard_tests command line option.