Files
impala/infra/python/bootstrap_virtualenv.py
Michael Ho 5f3dfdf6c7 IMPALA-1619: Support 64-bit allocations.
This change extends MemPool, FreePool and StringBuffer to support
64-bit allocations, fixes a bug in decompressor and extends various
places in the code to support 64-bit allocation sizes. With this
change, the text scanner can now decompress compressed files larger
than 1GB.

Note that the UDF interfaces FunctionContext::Allocate() and
FunctionContext::Reallocate() still use 32-bit for the input
argument to avoid breaking compatibility.

In addition, the byte size of a tuple is still assumed to be
within 32-bit. If it needs to be upgraded to 64-bit, it will be
done in a separate change.

Change-Id: I7ed28083d809a86d801a9c063a0aa32c50d32b20
Reviewed-on: http://gerrit.cloudera.org:8080/2781
Reviewed-by: Dan Hecht <dhecht@cloudera.com>
Tested-by: Internal Jenkins
2016-07-05 13:37:25 -07:00

294 lines
11 KiB
Python

# Copyright (c) 2015 Cloudera, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This module will create a python virtual env and install external dependencies. If
# the virtualenv already exists and the list of dependencies matches the list of
# installed dependencies, nothing will be done.
#
# This module can be run with python >= 2.4 but python >= 2.6 must be installed on the
# system. If the default 'python' command refers to < 2.6, python 2.6 will be used
# instead.
import glob
import logging
import optparse
import os
import shutil
import subprocess
import sys
import tarfile
import tempfile
import textwrap
import urllib
# Directory containing this script; every path below is resolved relative to it.
_SCRIPT_DIR = os.path.dirname(__file__)

# Logger named after this script's file name, minus directory and extension.
LOG = logging.getLogger(os.path.basename(os.path.splitext(__file__)[0]))

# The requirements file lives in the deps directory. The contents of that directory
# were generated using "pip install --download <DIR> -r requirements.txt".
DEPS_DIR = os.path.join(_SCRIPT_DIR, "deps")
REQS_PATH = os.path.join(DEPS_DIR, "requirements.txt")

# Location of the virtualenv. After installing, the requirements.txt will be copied
# into the virtualenv to record what was installed.
ENV_DIR = os.path.join(_SCRIPT_DIR, "env")
INSTALLED_REQS_PATH = os.path.join(ENV_DIR, "installed-requirements.txt")
def delete_virtualenv_if_exist():
  '''Removes the whole virtualenv directory tree (ENV_DIR). Does nothing if the
  directory doesn't exist.
  '''
  if not os.path.exists(ENV_DIR):
    return
  shutil.rmtree(ENV_DIR)
def create_virtualenv():
  '''Creates a new virtualenv at ENV_DIR.

  The virtualenv tarball found in DEPS_DIR is unpacked into a temporary build
  directory and its virtualenv.py is run with the python command chosen by
  detect_python_cmd(). The same command is passed as the interpreter for the new
  virtualenv.

  Raises an exception if the tarball or virtualenv.py can't be found, if no suitable
  python command exists, or if the virtualenv command fails. The temporary build
  directory is removed in every case.
  '''
  LOG.info("Creating python virtualenv")
  build_dir = tempfile.mkdtemp()
  try:
    archive = tarfile.open(find_file(DEPS_DIR, "virtualenv*.tar.gz"), "r:gz")
    try:
      # TarFile.extractall() requires python 2.5, so extract member by member.
      for member in archive.getmembers():
        archive.extract(member, build_dir)
    finally:
      archive.close()
    python_cmd = detect_python_cmd()
    exec_cmd([python_cmd, find_file(build_dir, "virtualenv*", "virtualenv.py"), "--quiet",
        "--python", python_cmd, ENV_DIR])
  finally:
    # Don't leave the unpacked sources behind, even if something above failed.
    shutil.rmtree(build_dir)
def exec_cmd(args, **kwargs):
  '''Runs a command to completion and returns everything it printed.

  Stderr is redirected into stdout, so the returned output contains both streams.
  'args' and 'kwargs' use the same format as subprocess.Popen().

  Raises an Exception, with the command and its output in the message, if the return
  status is not zero.
  '''
  child = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
      **kwargs)
  captured, _ = child.communicate()
  if child.returncode == 0:
    return captured
  raise Exception("Command returned non-zero status\nCommand: %s\nOutput: %s"
      % (args, captured))
def exec_pip_install(args, **popen_kwargs):
  '''Runs "pip install" inside the virtualenv with index access disabled, so packages
  are looked up only in DEPS_DIR.

  'args' is a list of additional pip arguments (package specs, "-r <file>", ...).
  'popen_kwargs' are passed on to exec_cmd(). Raises an exception if pip exits with a
  non-zero status.
  '''
  # Don't call the virtualenv pip directly, it uses a hashbang to call the python
  # virtualenv using an absolute path. If the path to the virtualenv is very long, the
  # hashbang won't work.
  env_bin_dir = os.path.join(ENV_DIR, "bin")
  deps_url = "file://%s" % urllib.pathname2url(os.path.abspath(DEPS_DIR))
  # Passes --no-binary for IMPALA-3767: without this, Cython (and
  # several other packages) fail download.
  #
  # --no-binary takes a value. It must be given one (":all:") explicitly, otherwise pip
  # consumes the following "--no-index" as that value and the index is never disabled.
  exec_cmd([os.path.join(env_bin_dir, "python"), os.path.join(env_bin_dir, "pip"),
      "install", "--no-binary", ":all:", "--no-index", "--find-links", deps_url] + args,
      **popen_kwargs)
def find_file(*paths):
  '''Joins 'paths' into a single glob pattern and returns the one path it matches.
  Raises an exception if the pattern matches nothing or matches more than one path.
  Ex: find_file('/etc', 'h*sts') --> /etc/hosts
  '''
  pattern = os.path.join(*paths)
  matches = glob.glob(pattern)
  if not matches:
    raise Exception("No file found at %s" % pattern)
  if len(matches) > 1:
    raise Exception("Found too many files at %s: %s" % (pattern, matches))
  return matches[0]
def detect_python_cmd():
  '''Returns the system command that provides python 2.6 or greater.

  Each candidate command name is looked up in every directory of the PATH environment
  variable (or of the OS default search path if PATH isn't set). Candidates are tried
  in the order listed below, starting with plain "python". The first executable file
  that exits with status zero when run with a small version check script is returned
  as a full path.

  Raises an exception if no candidate passes the check.
  '''
  search_dirs = os.environ.get("PATH", os.defpath).split(os.path.pathsep)
  # Exits with status 1 if the interpreter is older than 2.6, 0 otherwise.
  version_check = textwrap.dedent("""
      import sys
      sys.exit(int(sys.version_info[:2] < (2, 6)))""")
  for cmd in ("python", "python27", "python2.7", "python-27", "python-2.7", "python26",
      "python2.6", "python-26", "python-2.6"):
    for search_dir in search_dirs:
      cmd_path = os.path.join(search_dir, cmd)
      # Only regular files can be run. A directory that happens to be named like a
      # candidate would pass a plain existence + X_OK test and then fail to execute.
      if not os.path.isfile(cmd_path) or not os.access(cmd_path, os.X_OK):
        continue
      if subprocess.call([cmd_path, "-c", version_check]) == 0:
        return cmd_path
  raise Exception("Could not find minimum required python version 2.6")
def install_deps():
  '''Installs the packages listed in the requirements file into the virtualenv.

  The snappy lib and include directories under IMPALA_TOOLCHAIN are passed to
  build_ext, together with the current LD_LIBRARY_PATH. Once the install succeeds,
  the requirements file is copied to INSTALLED_REQS_PATH.
  '''
  environ = os.environ
  snappy_dir = "%s/snappy-%s" % (environ.get("IMPALA_TOOLCHAIN", ""),
      environ.get("IMPALA_SNAPPY_VERSION", ""))
  library_dirs = "%s/lib:%s" % (snappy_dir, environ.get("LD_LIBRARY_PATH", ""))
  pip_args = []
  # Each build_ext setting has to be passed to pip as its own --global-option.
  for option in ("build_ext", "-L" + library_dirs, "-I" + snappy_dir + "/include"):
    pip_args.extend(["--global-option", option])
  pip_args.extend(["-r", REQS_PATH])
  LOG.info("Installing packages into the virtualenv")
  exec_pip_install(pip_args)
  shutil.copyfile(REQS_PATH, INSTALLED_REQS_PATH)
def _installed_kudu_version_string():
  '''Returns the version of the kudu module importable from the virtualenv, or an empty
  string if the module can't be imported.'''
  # The "pip" command could be used to provide the version of Kudu installed (if any)
  # but it's a little too slow. Running the virtualenv python to detect the installed
  # version is faster.
  return exec_cmd([os.path.join(ENV_DIR, "bin", "python"), "-c",
      textwrap.dedent("""
      try:
        import kudu
        print kudu.__version__
      except ImportError:
        pass""")]).strip()


def _required_kudu_version_string():
  '''Returns the version named by the "# kudu-python==<version>" line of the
  requirements file. Raises an exception if there is no such line.'''
  reqs_file = open(REQS_PATH)
  try:
    for line in reqs_file:
      if line.startswith("# kudu-python=="):
        return line.split()[1].split("==")[1]
  finally:
    reqs_file.close()
  raise Exception("Unable to find kudu-python version in requirements file")


def install_kudu_client_if_possible():
  '''Installs the Kudu python module if possible. The Kudu module is the only one that
  requires the toolchain. If the toolchain isn't in use or hasn't been populated
  yet, nothing will be done. Also nothing will be done if the Kudu client lib required
  by the module isn't available (as determined by KUDU_IS_SUPPORTED), or if the
  version already installed in the virtualenv matches the version in the requirements
  file.

  Raises an exception if the requirements file doesn't name a kudu-python version, if
  the Kudu client can't be found, or if the pip install fails. Raises KeyError if
  KUDU_IS_SUPPORTED or IMPALA_KUDU_VERSION is missing from the environment when it is
  needed.
  '''
  if os.environ["KUDU_IS_SUPPORTED"] != "true":
    LOG.debug("Skipping Kudu: Kudu is not supported")
    return
  impala_toolchain_dir = os.environ.get("IMPALA_TOOLCHAIN")
  if not impala_toolchain_dir:
    LOG.debug("Skipping Kudu: IMPALA_TOOLCHAIN not set")
    return
  toolchain_kudu_dir = os.path.join(
      impala_toolchain_dir, "kudu-" + os.environ["IMPALA_KUDU_VERSION"])
  if not os.path.exists(toolchain_kudu_dir):
    LOG.debug("Skipping Kudu: %s doesn't exist", toolchain_kudu_dir)
    return

  actual_version_string = _installed_kudu_version_string()
  # An empty version string (module not installed) yields an empty list.
  actual_version = [int(v) for v in actual_version_string.split(".") if v]
  expected_version_string = _required_kudu_version_string()
  expected_version = [int(v) for v in expected_version_string.split(".")]
  if actual_version and actual_version == expected_version:
    LOG.debug("Skipping Kudu: Installed %s == required %s",
        actual_version_string, expected_version_string)
    return
  LOG.debug("Kudu installation required. Actual version %s. Required version %s.",
      actual_version, expected_version)

  LOG.info("Installing Kudu into the virtualenv")
  # The installation requires that KUDU_HOME/build/latest exists. An empty directory
  # structure will be made to satisfy that. The Kudu client headers and lib will be made
  # available through GCC environment variables.
  #
  # A uniquely named directory is used so that concurrent runs sharing the same temp
  # dir can't delete each other's build dir.
  fake_kudu_build_dir = tempfile.mkdtemp(prefix="virtualenv-kudu-")
  try:
    os.makedirs(os.path.join(fake_kudu_build_dir, "build", "latest"))
    env = dict(os.environ)
    env["KUDU_HOME"] = fake_kudu_build_dir
    kudu_client_dir = find_kudu_client_install_dir()
    env["CPLUS_INCLUDE_PATH"] = os.path.join(kudu_client_dir, "include")
    env["LIBRARY_PATH"] = os.path.pathsep.join([os.path.join(kudu_client_dir, 'lib'),
        os.path.join(kudu_client_dir, 'lib64')])
    exec_pip_install(["kudu-python==" + expected_version_string], env=env)
  finally:
    # Cleanup is best effort, a leftover temp dir shouldn't fail the whole bootstrap.
    try:
      shutil.rmtree(fake_kudu_build_dir)
    except Exception:
      LOG.debug("Error removing temp Kudu build dir", exc_info=True)
def find_kudu_client_install_dir():
  '''Returns the directory that is expected to contain the Kudu client "include" and
  "lib"/"lib64" directories.

  If the KUDU_CLIENT_DIR environment variable is set to a non-empty value, the result
  is <KUDU_CLIENT_DIR>/usr/local and the client must exist there. Otherwise the result
  is the "debug" directory of the toolchain's Kudu package, located through
  IMPALA_TOOLCHAIN and IMPALA_KUDU_VERSION.

  Raises an exception if the client is required to exist but can't be found. Raises
  KeyError if the toolchain variables are needed but not set.
  '''
  # An unset KUDU_CLIENT_DIR is treated the same as an empty one.
  custom_client_dir = os.environ.get("KUDU_CLIENT_DIR")
  if custom_client_dir:
    install_dir = os.path.join(custom_client_dir, "usr", "local")
    error_if_kudu_client_not_found(install_dir)
    return install_dir
  # If the toolchain appears to have been setup already, then the Kudu client is
  # required to exist. It's possible that the toolchain won't be setup yet though
  # since the toolchain bootstrap script depends on the virtualenv.
  kudu_base_dir = os.path.join(os.environ["IMPALA_TOOLCHAIN"],
      "kudu-%s" % os.environ["IMPALA_KUDU_VERSION"])
  install_dir = os.path.join(kudu_base_dir, "debug")
  if os.path.exists(kudu_base_dir):
    error_if_kudu_client_not_found(install_dir)
  return install_dir
def error_if_kudu_client_not_found(install_dir):
  '''Raises an exception unless 'install_dir' contains the Kudu client.

  The header include/kudu/client/client.h must exist, and a file named
  libkudu_client.so must exist somewhere below the lib directory. The lib directory
  is <install_dir>/lib64 if that exists, otherwise <install_dir>/lib.
  '''
  client_header = os.path.join(install_dir, "include", "kudu", "client", "client.h")
  if not os.path.exists(client_header):
    raise Exception("Kudu client header not found at %s" % client_header)
  client_lib_name = "libkudu_client.so"
  search_root = os.path.join(install_dir, "lib64")
  if not os.path.exists(search_root):
    search_root = os.path.join(install_dir, "lib")
  # Each walk entry is (dirpath, dirnames, filenames); only the file names matter.
  for walk_entry in os.walk(search_root):
    if client_lib_name in walk_entry[2]:
      return
  raise Exception("%s not found at %s" % (client_lib_name, search_root))
def deps_are_installed():
  '''Returns True if the copy of the requirements file recorded inside the virtualenv
  exists and has the same contents as the current requirements file. Returns False
  otherwise. Logs that an upgrade is needed when the contents differ.
  '''
  if not os.path.exists(INSTALLED_REQS_PATH):
    return False
  installed_reqs_file = open(INSTALLED_REQS_PATH)
  try:
    reqs_file = open(REQS_PATH)
    try:
      up_to_date = reqs_file.read() == installed_reqs_file.read()
    finally:
      reqs_file.close()
  finally:
    installed_reqs_file.close()
  if not up_to_date:
    LOG.info("Virtualenv upgrade needed")
  return up_to_date
def setup_virtualenv_if_not_exists():
  '''Rebuilds the virtualenv from scratch unless deps_are_installed() reports that it
  is already up-to-date. A rebuild deletes any existing virtualenv, creates a new one
  and installs the dependencies into it.
  '''
  if deps_are_installed():
    return
  delete_virtualenv_if_exist()
  create_virtualenv()
  install_deps()
  LOG.info("Virtualenv setup complete")
if __name__ == "__main__":
  # Command line entry point: parse the options, then either print the library path
  # and exit, or bring the virtualenv (and the Kudu client, if possible) up-to-date.
  parser = optparse.OptionParser()
  parser.add_option("-l", "--log-level", default="INFO",
      choices=("DEBUG", "INFO", "WARN", "ERROR"))
  parser.add_option("-r", "--rebuild", action="store_true",
      help="Force a rebuild of the virtualenv even if it exists and appears to be"
      " completely up-to-date.")
  parser.add_option("--print-ld-library-path", action="store_true",
      help="Print the LD_LIBRARY_PATH that should be used when running python from"
      " the virtualenv.")
  options, args = parser.parse_args()

  if options.print_ld_library_path:
    # Query mode: nothing is created or modified, the path is printed and we exit.
    kudu_client_dir = find_kudu_client_install_dir()
    ld_library_path = os.path.pathsep.join([os.path.join(kudu_client_dir, 'lib'),
        os.path.join(kudu_client_dir, 'lib64')])
    sys.stdout.write(ld_library_path + "\n")
    sys.exit()

  logging.basicConfig(level=getattr(logging, options.log_level))
  if options.rebuild:
    delete_virtualenv_if_exist()
  setup_virtualenv_if_not_exists()
  install_kudu_client_if_possible()