#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# This script bootstraps a system for Impala development from almost nothing; it is known
# to work on Ubuntu 16.04. It clobbers some local environment and system
# configurations, so it is best to run this in a fresh install. It also sets up the
# ~/.bashrc for the calling user and impala-config-local.sh with some environment
# variables to make Impala compile and run after this script is complete.
# When IMPALA_HOME is set, the script will bootstrap Impala development in the
# location specified.
#
# The intended user is a person who wants to start contributing code to Impala. This
# script serves as an executable reference point for how to get started.
#
# To run this in a Docker container:
#
#   1. Run with --privileged
#   2. Give the container a non-root sudoer with NOPASSWD:
#      apt-get update
#      apt-get install sudo
#      adduser --disabled-password --gecos '' impdev
#      echo 'impdev ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
#   3. Run this script as that user: su - impdev -c /bootstrap_development.sh
#
# This script has some specializations for CentOS/Redhat 6/7 and Ubuntu.
# Of note, inside of Docker, Redhat 7 doesn't allow you to start daemons
# with systemctl, so sshd and postgresql are started manually in those cases.

set -eu -o pipefail

# Default IMPALA_HOME to the parent of the directory that contains this script.
# "$0" is quoted so a script path containing spaces still resolves.
: "${IMPALA_HOME:=$(cd "$(dirname "$0")"/..; pwd)}"
export IMPALA_HOME

if [[ -t 1 ]] # if on an interactive terminal
then
  echo "This script will clobber some system settings. Are you sure you want to"
  echo -n "continue? "
  while true
  do
    # -r keeps backslashes in the reply literal.
    read -r -p "[yes/no] " ANSWER
    # Compare case-insensitively by upper-casing the reply.
    ANSWER=${ANSWER^^}
    if [[ $ANSWER = YES ]]
    then
      break
    elif [[ $ANSWER = NO ]]
    then
      echo "OK, Bye!"
      exit 1
    fi
  done
fi

set -x

# Determine whether we're running on redhat or ubuntu.
# Each flag is either empty or the string "true"; the helper functions below
# test them with [[ ... == true ]].
REDHAT=
REDHAT7=
REDHAT8=
REDHAT9=
UBUNTU=
UBUNTU16=
UBUNTU18=
UBUNTU20=
UBUNTU22=
UBUNTU24=
IN_DOCKER=
if [[ -f /etc/redhat-release ]]; then
  REDHAT=true
  echo "Identified redhat system."
  if grep 'release 9\.' /etc/redhat-release; then
    REDHAT9=true
    echo "Identified redhat9 system."
  fi
  if grep 'release 8\.' /etc/redhat-release; then
    REDHAT8=true
    echo "Identified redhat8 system."
  fi
  if grep 'release 7\.' /etc/redhat-release; then
    REDHAT7=true
    echo "Identified redhat7 system."
  fi
  # TODO: restrict redhat versions
else
  # Provides DISTRIB_ID and DISTRIB_RELEASE, which are read below.
  source /etc/lsb-release
  if [[ $DISTRIB_ID = Ubuntu ]]
  then
    UBUNTU=true
    echo "Identified Ubuntu system."
    # Kerberos setup would pop up dialog boxes without this
    export DEBIAN_FRONTEND=noninteractive
    if [[ $DISTRIB_RELEASE = 16.04 ]]
    then
      UBUNTU16=true
      echo "Identified Ubuntu 16.04 system."
    elif [[ $DISTRIB_RELEASE = 18.04 ]]
    then
      UBUNTU18=true
      echo "Identified Ubuntu 18.04 system."
    elif [[ $DISTRIB_RELEASE = 20.04 ]]
    then
      UBUNTU20=true
      echo "Identified Ubuntu 20.04 system."
    elif [[ $DISTRIB_RELEASE = 22.04 ]]
    then
      UBUNTU22=true
      echo "Identified Ubuntu 22.04 system."
    elif [[ $DISTRIB_RELEASE = 24.04 ]]
    then
      UBUNTU24=true
      echo "Identified Ubuntu 24.04 system."
    else
      echo "This script supports Ubuntu versions 16.04, 18.04, 20.04, 22.04, or 24.04" >&2
      exit 1
    fi
  else
    echo "This script only supports Ubuntu or RedHat" >&2
    exit 1
  fi
fi

# On cgroup v2 hosts /proc/1/cgroup typically contains only "0::/" with no
# "docker" marker, so also look for the /.dockerenv file that Docker creates
# inside containers.
if grep docker /proc/1/cgroup || [[ -f /.dockerenv ]]; then
  IN_DOCKER=true
  echo "Identified we are running inside of Docker."
fi

# The helpers below run their arguments as a command only when the matching
# platform flag is "true". When the flag is not set they do nothing and return 0,
# which keeps "set -e" from aborting the script on non-matching platforms.
# They can be chained, e.g. "redhat indocker <command>".

# Helper function to execute following command only on Ubuntu
function ubuntu {
  if [[ "$UBUNTU" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on Ubuntu 16.04
function ubuntu16 {
  if [[ "$UBUNTU16" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on Ubuntu 18.04
function ubuntu18 {
  if [[ "$UBUNTU18" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on Ubuntu 20.04
function ubuntu20 {
  if [[ "$UBUNTU20" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on Ubuntu 22.04
function ubuntu22 {
  if [[ "$UBUNTU22" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on Ubuntu 24.04
function ubuntu24 {
  if [[ "$UBUNTU24" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on RedHat
function redhat {
  if [[ "$REDHAT" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on RedHat7
function redhat7 {
  if [[ "$REDHAT7" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on RedHat8
function redhat8 {
  if [[ "$REDHAT8" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only on RedHat9
function redhat9 {
  if [[ "$REDHAT9" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only in docker
function indocker {
  if [[ "$IN_DOCKER" == true ]]; then
    "$@"
  fi
}

# Helper function to execute following command only outside of docker
function notindocker {
  if [[ "$IN_DOCKER" != true ]]; then
    "$@"
  fi
}

# X permission on home directory is needed for some uses of postgresql (IMPALA-13693)
chmod o+X ~

# Note that yum has its own retries; see yum.conf(5).
# Resolve the real apt-get binary before the wrapper function of the same name
# is defined below. On non-Ubuntu systems the "ubuntu" guard runs nothing and
# this stays empty.
REAL_APT_GET=$(ubuntu command -v apt-get)

# Wrapper that shadows apt-get: runs the real binary under sudo (preserving the
# environment with -E) and retries up to 30 times, sleeping ITER seconds after
# the ITER-th failed attempt.
# Arguments: passed through unchanged to apt-get.
# Returns:   0 as soon as one attempt succeeds, 1 after all attempts failed.
function apt-get {
  local ITER
  for ((ITER = 1; ITER <= 30; ITER++)); do
    echo "ATTEMPT: ${ITER}"
    if sudo -E "${REAL_APT_GET}" "$@"
    then
      return 0
    fi
    sleep "${ITER}"
  done
  echo "NO MORE RETRIES"
  return 1
}

echo ">>> Installing build tools"

if [[ "$UBUNTU" == true ]]; then
  # Wait until no process holds the dpkg frontend lock.
  while sudo fuser /var/lib/dpkg/lock-frontend; do
    sleep 1
  done
fi

# Set UBUNTU_JAVA_VERSION, UBUNTU_PACKAGE_ARCH, REDHAT_JAVA_VERSION
source "$IMPALA_HOME/bin/impala-config-java.sh"

ubuntu apt-get update
ubuntu apt-get --yes install ccache curl file gawk g++ gcc apt-utils git libffi-dev \
        libkrb5-dev krb5-admin-server krb5-kdc krb5-user libsasl2-dev \
        libsasl2-modules libsasl2-modules-gssapi-mit libssl-dev make ninja-build \
        python3-dev python3-setuptools python3-venv postgresql \
        ssh wget vim-common psmisc lsof net-tools language-pack-en libxml2-dev \
        libxslt-dev "openjdk-${UBUNTU_JAVA_VERSION}-jdk" \
        "openjdk-${UBUNTU_JAVA_VERSION}-source" "openjdk-${UBUNTU_JAVA_VERSION}-dbg"

# Regular python packages don't exist on Ubuntu 22. Everything is Python 3.
ubuntu16 apt-get --yes install python python-dev python-setuptools
ubuntu18 apt-get --yes install python python-dev python-setuptools
ubuntu20 apt-get --yes install python python-dev python-setuptools

# Ubuntu 20's Python 2.7.18-1~20.04.5 version has a bug in its tarfile support.
# If we detect the affected tarfile.py, download a patched version and overwrite it.
if [[ "$UBUNTU20" == true ]]; then
  if [[ -f /usr/lib/python2.7/tarfile.py ]]; then
    TARFILE_PY_HASH=$(sha1sum /usr/lib/python2.7/tarfile.py | cut -d' ' -f1)
    if [[ "${TARFILE_PY_HASH}" == "6e1a6d9ea2a535cbb17fe266ed9ac76eb5e27b89" ]]; then
      TMP_DIR=$(mktemp -d)
      pushd "${TMP_DIR}"
      wget -nv https://launchpadlibrarian.net/759546541/tarfile.py
      sudo cp tarfile.py /usr/lib/python2.7/tarfile.py
      popd
      rm -rf "${TMP_DIR}"
    fi
  fi
fi

# Required by Kudu in the minicluster. Older Kudu versions depend on libtinfo5,
# versions that can be compiled for Ubuntu 24.04 depend on libtinfo6.
ubuntu20 apt-get --yes install libtinfo5 libtinfo6
ubuntu22 apt-get --yes install libtinfo5 libtinfo6
ubuntu24 apt-get --yes install libtinfo6

# Processor type as reported by uname; also used later in the script.
ARCH_NAME=$(uname -p)
if [[ "${ARCH_NAME}" == aarch64 ]]; then
  # Additional packages installed only on aarch64 Ubuntu hosts.
  AARCH64_UBUNTU_PACKAGES=(
    unzip pkg-config flex maven python3-pip build-essential
    texinfo bison autoconf automake libtool libz-dev libncurses-dev
    libncurses5-dev libreadline-dev
  )
  ubuntu apt-get --yes install "${AARCH64_UBUNTU_PACKAGES[@]}"
fi

# Listing the alternatives is informational only, so a failure is ignored.
ubuntu sudo update-java-alternatives -l || true
# Configure the default Java version to be the version we selected.
ubuntu sudo update-java-alternatives -v -s \
    "java-1.${UBUNTU_JAVA_VERSION}.0-openjdk-${UBUNTU_PACKAGE_ARCH}"

# Build tools and libraries installed on RedHat-family hosts.
REDHAT_PACKAGES=(
  file gawk gcc gcc-c++ git krb5-devel krb5-server
  krb5-workstation libevent-devel libffi-devel make openssl-devel cyrus-sasl
  cyrus-sasl-gssapi cyrus-sasl-devel cyrus-sasl-plain
  postgresql postgresql-server rpm-build
  wget vim-common nscd cmake zlib-devel
  procps psmisc lsof openssh-server python3-devel python3-setuptools
  net-tools langpacks-en glibc-langpack-en libxml2-devel libxslt-devel
  "java-${REDHAT_JAVA_VERSION}-openjdk-src" "java-${REDHAT_JAVA_VERSION}-openjdk-devel"
)
redhat sudo yum install -y "${REDHAT_PACKAGES[@]}"

# Point each Java-related alternative at the selected OpenJDK build.
REDHAT_JDK_ALTERNATIVE="java-${REDHAT_JAVA_VERSION}-openjdk.${ARCH_NAME}"
for JAVA_ALTERNATIVE_NAME in java javac java_sdk_openjdk jre_openjdk; do
  redhat sudo alternatives --set "${JAVA_ALTERNATIVE_NAME}" "${REDHAT_JDK_ALTERNATIVE}"
done

# update-java-alternatives may not take effect if there is a Java in PATH
which java
java -version
which javac
javac -version

# fuse-devel doesn't exist for Redhat 9
redhat7 sudo yum install -y fuse-devel curl
redhat8 sudo yum install -y fuse-devel curl
# Redhat9 can have curl-minimal preinstalled, which can conflict with curl.
# Adding --allowerasing allows curl to replace curl-minimal.
redhat9 sudo yum install -y --allowerasing curl

# RedHat / CentOS 8 exposes only specific versions of Python.
# Set up unversioned default Python 2.x for older CentOS versions
redhat7 sudo yum install -y python-devel python-setuptools python-argparse

# Install Python 2.x explicitly for CentOS 8
# Makes "python" resolve to Python 2 via alternatives (installing python2 with dnf
# if needed) and then installs python2-devel. Currently not called; see below.
function setup_python2() {
  # The version check is anchored so that only a leading "2." counts; an
  # unanchored match would also accept versions such as 3.12.1.
  if command -v python && [[ $(python --version 2>&1 | cut -d ' ' -f 2) =~ ^2\. ]]; then
    echo "We have Python 2.x";
  else
    if ! command -v python2; then
      # Python2 needs to be installed
      sudo dnf install -y python2
    fi
    # Here Python2 is installed, but is not the default Python.
    # 1. Link pip's version to Python's version
    sudo alternatives --add-slave python /usr/bin/python2 /usr/bin/pip pip /usr/bin/pip2
    sudo alternatives --add-slave python /usr/libexec/no-python /usr/bin/pip pip \
        /usr/libexec/no-python
    # 2. Set Python2 (with pip2) to be the system default.
    sudo alternatives --set python /usr/bin/python2
  fi
  # Here the Python2 runtime is already installed, add the dev package
  sudo dnf -y install python2-devel
}

# IMPALA-14606: Stop building using Python 2 and always run with
# IMPALA_USE_PYTHON3_TESTS=true.
# redhat8 setup_python2
# redhat8 pip install --user argparse

# Point Python to Python 3 for Redhat 9 and Ubuntu 22, or newer
# Does nothing when a "python" command already exists or when python3 is absent.
# Exits the script with status 1 if neither alternatives tool is available.
function setup_python3() {
  # If python is already set, then use it. Otherwise, try to point python to python3.
  if ! command -v python > /dev/null; then
    if command -v python3 ; then
      # Newer OSes (e.g. Redhat 9 and equivalents) make it harder to get Python 2, and we
      # need to start using Python 3 by default.
      # For these new OSes (Ubuntu 22+, Redhat 9), there is no alternative entry for
      # python, so we need to create one from scratch.
      if command -v alternatives > /dev/null; then
        if sudo alternatives --list | grep python > /dev/null ; then
          sudo alternatives --set python /usr/bin/python3
        else
          # The alternative doesn't exist, create it
          sudo alternatives --install /usr/bin/python python /usr/bin/python3 20
        fi
      elif command -v update-alternatives > /dev/null; then
        # This is what Ubuntu 20/22+ does. There is no official python alternative,
        # so we need to create one.
        sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 20
      else
        echo "ERROR: trying to set python to point to python3"
        echo "ERROR: alternatives/update-alternatives don't exist, so giving up..."
        exit 1
      fi
    fi
  fi
}

redhat8 setup_python3
redhat9 setup_python3
ubuntu22 setup_python3
ubuntu24 setup_python3

# CentOS repos don't contain ccache, so install from EPEL
redhat sudo yum install -y epel-release
redhat sudo yum install -y ccache

# Clean up yum caches
redhat sudo yum clean all

# Download Maven since the packaged version is pretty old.
: "${IMPALA_TOOLCHAIN_HOST:=native-toolchain.s3.amazonaws.com}"
MVN_VERSION="3.9.8"
if [ ! -d "/usr/local/apache-maven-${MVN_VERSION}" ]; then
  # Download into a private temporary directory so that a stale or partial tarball
  # left in the working directory by an earlier run can never be the file that is
  # verified and unpacked, and so that nothing is left behind afterwards.
  MVN_TARBALL="apache-maven-${MVN_VERSION}-bin.tar.gz"
  MVN_DOWNLOAD_DIR=$(mktemp -d)
  wget -nv -O "${MVN_DOWNLOAD_DIR}/${MVN_TARBALL}" \
    "https://${IMPALA_TOOLCHAIN_HOST}/maven/${MVN_TARBALL}"
  sha512sum -c - <<< "7d171def9b85846bf757a2cec94b7529371068a0670df14682447224e57983528e97a6d1b850327e4ca02b139abaab7fcb93c4315119e6f0ffb3f0cbc0d0b9a2  ${MVN_DOWNLOAD_DIR}/${MVN_TARBALL}"
  sudo tar -C /usr/local -xzf "${MVN_DOWNLOAD_DIR}/${MVN_TARBALL}"
  rm -rf "${MVN_DOWNLOAD_DIR}"
  # Ensure that Impala's preferred version is installed locally,
  # even if a previous version exists there.
  sudo ln -s -f "/usr/local/apache-maven-${MVN_VERSION}/bin/mvn" "/usr/local/bin"

  # reset permissions on redhat8
  # TODO: figure out why this is necessary for redhat8
  MAVEN_DIRECTORY="/usr/local/apache-maven-${MVN_VERSION}"
  redhat8 indocker sudo chmod 0755 "${MAVEN_DIRECTORY}"
  redhat8 indocker sudo chmod 0755 "${MAVEN_DIRECTORY}"/{bin,boot}
  redhat9 indocker sudo chmod 0755 "${MAVEN_DIRECTORY}"
  redhat9 indocker sudo chmod 0755 "${MAVEN_DIRECTORY}"/{bin,boot}
fi

# Start sshd unless "service --status-all" already lists ssh as running ("[ + ]").
# The pattern accepts one or more spaces before the service name.
if ! { service --status-all | grep -E '^ \[ \+ \] +ssh$'; }
then
  ubuntu sudo service ssh start
  redhat notindocker sudo service sshd start
  redhat indocker sudo /usr/bin/ssh-keygen -A
  redhat indocker sudo /usr/sbin/sshd
  # The CentOS 8.1 image includes /var/run/nologin by mistake; this file prevents
  # SSH logins. See https://github.com/CentOS/sig-cloud-instance-images/issues/60
  redhat8 indocker sudo rm -f /var/run/nologin
fi

# TODO: config ccache to give it plenty of space
# TODO: check that there is enough space on disk to do a build and data load
# TODO: make this work with non-bash shells

echo ">>> Configuring system"

# Initializes (on RedHat), reconfigures and starts PostgreSQL, then makes sure the
# "hiveuser" role exists with CREATEDB. Reads ARCH_NAME, which is set earlier in the
# script. Defined here but invoked at the very end of the script.
function setup_postgresql() {
  echo ">>> Configuring postgresql. This can fail if postgres is already initialized"
  # initdb can fail if it was run before on this host - ignore this error
  redhat notindocker sudo service postgresql initdb || true
  redhat notindocker sudo service postgresql stop
  redhat indocker sudo -u postgres PGDATA=/var/lib/pgsql/data pg_ctl init
  ubuntu sudo service postgresql stop

  # These configurations expose connecting to PostgreSQL via md5-hashed
  # passwords over TCP to localhost, and the local socket is trusted
  # widely.
  ubuntu sudo sed -ri 's/local +all +all +peer/local all all trust/g' \
    /etc/postgresql/*/main/pg_hba.conf
  # Accept remote connections from the hosts in the same subnet.
  ubuntu sudo sed -ri "s/#listen_addresses = 'localhost'/listen_addresses = '0.0.0.0'/g" \
    /etc/postgresql/*/main/postgresql.conf
  # Dots are escaped so that only the literal address 127.0.0.1/32 is rewritten.
  ubuntu sudo sed -ri 's/host +all +all +127\.0\.0\.1\/32/host all all samenet/g' \
    /etc/postgresql/*/main/pg_hba.conf
  redhat sudo sed -ri 's/local +all +all +(ident|peer)/local all all trust/g' \
    /var/lib/pgsql/data/pg_hba.conf
  # Accept md5 passwords from localhost
  redhat sudo sed -i -e 's,\(host.*\)ident,\1md5,' /var/lib/pgsql/data/pg_hba.conf
  # Accept remote connections from the hosts in the same subnet.
  redhat sudo sed -ri "s/#listen_addresses = 'localhost'/listen_addresses = '0.0.0.0'/g" \
    /var/lib/pgsql/data/postgresql.conf
  redhat sudo sed -ri 's/host +all +all +127\.0\.0\.1\/32/host all all samenet/g' \
    /var/lib/pgsql/data/pg_hba.conf

  ubuntu sudo service postgresql start
  redhat notindocker sudo service postgresql start
  # Important to redirect pg_ctl to a logfile, lest it keep the stdout
  # file descriptor open, preventing the shell from exiting.
  redhat indocker sudo -u postgres PGDATA=/var/lib/pgsql/data bash -c \
    "pg_ctl start -w --timeout=120 >> /var/lib/pgsql/pg.log 2>&1"

  # Set up postgres for HMS
  if ! [[ 1 = $(sudo -u postgres psql -At -c "SELECT count(*) FROM pg_roles WHERE rolname = 'hiveuser';") ]]
  then
    sudo -u postgres psql -c "CREATE ROLE hiveuser LOGIN PASSWORD 'password';"
  fi
  sudo -u postgres psql -c "ALTER ROLE hiveuser WITH CREATEDB;"
  # On Ubuntu 18.04 aarch64 version, the sql 'select * from pg_roles' blocked,
  # because output of 'select *' is too long to display in 1 line.
  # So here just change it to 'select count(*)' as a work around.
  if [[ $ARCH_NAME == 'aarch64' ]]; then
    sudo -u postgres psql -c "SELECT count(*) FROM pg_roles WHERE rolname = 'hiveuser';"
  else
    sudo -u postgres psql -c "SELECT * FROM pg_roles WHERE rolname = 'hiveuser';"
  fi
  echo ">>> Configuring postgresql finished."
}

# Setup ssh to ssh to localhost
mkdir -p ~/.ssh
chmod go-rwx ~/.ssh
if ! [[ -f ~/.ssh/id_rsa ]]
then
  ssh-keygen -t rsa -N '' -q -f ~/.ssh/id_rsa
fi
# Append an empty line followed by the public key to authorized_keys.
{ echo "" | cat - ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys; } && chmod 0600 ~/.ssh/authorized_keys
echo -e "\nNoHostAuthenticationForLocalhost yes" >> ~/.ssh/config && chmod 0600 ~/.ssh/config
# Verify that a passwordless ssh login to localhost works.
ssh localhost whoami

# Workarounds for HDFS networking issues: On the minicluster, tests that rely
# on WebHDFS may fail with "Connection refused" errors because the namenode
# will return a "Location:" redirect to the hostname, but the datanode is only
# listening on localhost. See also HDFS-13797.
# To reproduce this, the following
# snippet may be useful:
#
#   $impala-python3
#   >>> import logging
#   >>> logging.basicConfig(level=logging.DEBUG)
#   >>> logging.getLogger("requests.packages.urllib3").setLevel(logging.DEBUG)
#   >>> from pywebhdfs.webhdfs import PyWebHdfsClient
#   >>> PyWebHdfsClient(host='localhost',port='5070', user_name='hdfs').read_file(
#         "/test-warehouse/tpch.region/region.tbl")
#   INFO:...:Starting new HTTP connection (1): localhost
#   DEBUG:...:"GET /webhdfs/v1//t....tbl?op=OPEN&user.name=hdfs HTTP/1.1" 307 0
#   INFO:...:Starting new HTTP connection (1): HOSTNAME.DOMAIN
#   Traceback (most recent call last):
#     ...
#   ...ConnectionError: ('Connection aborted.', error(111, 'Connection refused'))

# Prefer the FQDN first for rpc-mgr-kerberized-test as newer krb5 requires FQDN.

# Appends a line to a file (via sudo tee -a) unless the file already contains it.
# Arguments: $1 - path of the file to update
#            $2 - text to look for and, if absent, append
# The text is matched as a fixed string anywhere in a line (grep -F), so dots and
# other regex metacharacters in host names and addresses are taken literally.
add_if_not_there() {
  local target_file=$1
  local wanted_line=$2
  if ! grep -qF -- "${wanted_line}" "${target_file}"; then
    echo "${wanted_line}" | sudo tee -a "${target_file}"
  fi
}
add_if_not_there "/etc/hosts" "127.0.0.1 $(hostname) $(hostname -s)"
# Add hostnames with multiple labels to allow matching wildcard TLS certificates.
# Create names that map to v4/v6/dual localhost to help ipv6 testing.
add_if_not_there "/etc/hosts" "127.0.0.1 ip4.impala.test ip46.impala.test"
add_if_not_there "/etc/hosts" "::1 ip6.impala.test ip46.impala.test"
#
# In Docker, one can change /etc/hosts as above but not with sed -i. The error message is
# "sed: cannot rename /etc/sedc3gPj8: Device or resource busy". The following lines are
# basically sed -i but with cp instead of mv for -i part.
NEW_HOSTS=$(mktemp)
# Dots are escaped and the address is delimited with \b (a GNU sed extension) so that
# only the exact address 127.0.1.1 is rewritten, not e.g. 127.0.1.10.
sed 's/\b127\.0\.1\.1\b/127.0.0.1/g' /etc/hosts > "${NEW_HOSTS}"
# Show what changes; diff exits non-zero when the files differ, which is expected.
diff -u /etc/hosts "${NEW_HOSTS}" || true
sudo cp "${NEW_HOSTS}" /etc/hosts
rm "${NEW_HOSTS}"

sudo mkdir -p /var/lib/hadoop-hdfs
sudo chown "$(whoami)" /var/lib/hadoop-hdfs/

# TODO: restrict this to only the users it is needed for
# Only append the nofile limit when the exact line is not present yet, so that
# re-running the script does not keep adding duplicates.
if ! grep -qxF -- "* - nofile 1048576" /etc/security/limits.conf; then
  echo -e "\n* - nofile 1048576" | sudo tee -a /etc/security/limits.conf
fi

# Increase memlock for HDFS caching. On RedHat systems this defaults to 64 (KB). Ubuntu
# uses systemd, which sets its own default.
# With Ubuntu 18.04 that default is 16 KB,
# 20.04+ defaults to 64 MB. Set all to 64 MB for the current user; Impala test systems
# require 10s of GBs of memory, so this setting should not be a problem.
# Fall back to "id -un" when USER is not set in the environment.
USER=${USER-$(id -un)}
echo -e "${USER} - memlock 65536" | sudo tee /etc/security/limits.d/10-memlock.conf

# Default on CentOS limits a user to 1024 or 4096 processes (threads) , which isn't
# enough for minicluster with all of its friends.
redhat7 sudo sed -i 's,\*\s*soft\s*nproc\s*[0-9]*$,* soft nproc unlimited,' \
  /etc/security/limits.d/*-nproc.conf

# Appends the unlimited soft nproc limit to /etc/security/limits.conf unless that
# exact line is already present. Wrapping the whole pipeline in a function means the
# platform guards below cover the privileged "tee" as well as the "echo", and
# re-running the script does not add duplicate lines.
function raise_nproc_limit {
  local limit_line="* soft nproc unlimited"
  if ! grep -qxF -- "${limit_line}" /etc/security/limits.conf; then
    echo -e "${limit_line}" | sudo tee -a /etc/security/limits.conf
  fi
}
redhat8 raise_nproc_limit
redhat9 raise_nproc_limit

echo ">>> Checking out Impala"

# If there is no Impala git repo, get one now
if ! [[ -d "$IMPALA_HOME" ]]
then
  time -p git clone https://gitbox.apache.org/repos/asf/impala.git "$IMPALA_HOME"
fi
cd "$IMPALA_HOME"
# Persist IMPALA_HOME for future shells and apply it to this one.
SET_IMPALA_HOME="export IMPALA_HOME=$(pwd)"
echo -e "\n$SET_IMPALA_HOME" >> ~/.bashrc
eval "$SET_IMPALA_HOME"

# Try to prepopulate the m2 directory to save time
# Enabled unless PREPOPULATE_M2_REPOSITORY is set to something other than "true";
# a failure here is reported but does not abort the script.
if [[ "${PREPOPULATE_M2_REPOSITORY:-true}" == true ]] ; then
  echo ">>> Populating m2 directory..."
  if ! bin/jenkins/populate_m2_directory.py ; then
    echo "Failed to prepopulate the m2 directory. Continuing..."
  fi
else
  echo ">>> Skip populating m2 directory"
fi

setup_postgresql
# Be careful about adding code after postgres initialization, it may fail (IMPALA-13802).