#!/bin/bash # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # # Entrypoint code for test-with-docker.py containers. test-with-docker.py # will create Docker containers with this script as the entrypoint, # with a variety of arguments. See test-with-docker.py for a more # general overview. # # This assumes that the following are already mounted inside # the container: # /etc/localtime -> /mnt/localtime # Helps timestamps be in the time zone of the host # $IMPALA_HOME [a git repo of Impala] -> /repo # Used to check out Impala afresh # $IMPALA_HOME/logs/docker// -> /logs # Used to save logs out to host. # represents the --name passed into # test-with-docker for the test run. # indicates which specific container is being run. # ~/.ccache [configurable] -> /ccache # Used to speed up builds. # # Usage: # entrypoint.sh build # entrypoint.sh test_suite # where is one of: BE_TEST JDBC_TEST CLUSTER_TEST # EE_TEST_SERIAL EE_TEST_PARALLEL # Starts or stops postgres # The centos:7 Docker image doesn't allow systemctl to start postgresql, # so we start it explicitly with pg_ctl. function _pg_ctl() { if [ -f /etc/redhat-release ]; then if which systemctl; then sudo -u postgres PGDATA=/var/lib/pgsql/data bash -c "pg_ctl $1 -w --timeout=120 >> /var/lib/pgsql/pg.log 2>&1" return fi fi sudo service postgresql $1 } # Install Python2 with pip2 and make them the default Python and pip commands # on RedHat / CentOS 8. # This has no notion of "default" Python, and can install both Python2 and Python3 # side by side. Impala currently needs Python2 as the default version. # The function is adaptive; it performs only the necessary steps; it shares the installer # logic with bin/bootstrap_system.sh function install_python2_for_centos8() { if command -v python && [[ $(python --version 2>&1 | cut -d ' ' -f 2) =~ 2\. ]]; then echo "We have Python 2.x"; else if ! command -v python2; then # Python2 needs to be installed dnf install -y python2 fi # Here Python2 is installed, but is not the default Python. # 1. Link pip's version to Python's version alternatives --add-slave python /usr/bin/python2 /usr/bin/pip pip /usr/bin/pip2 alternatives --add-slave python /usr/libexec/no-python /usr/bin/pip pip \ /usr/libexec/no-python # 2. Set Python2 (with pip2) to be the system default. alternatives --set python /usr/bin/python2 fi # Here the Python2 runtime is already installed, add the dev package dnf -y install python2-devel } # Boostraps the container by creating a user and adding basic tools like Python and git. # Takes a uid as an argument for the user to be created. function build() { # Handy for testing. if [[ $TEST_TEST_WITH_DOCKER ]]; then # We sleep busily so that CPU metrics will show usage, to # better exercise the timeline code. echo sleeping busily for 4 seconds bash -c 'while [[ $SECONDS -lt 4 ]]; do :; done' return fi # Configure timezone, so any timestamps that appear are coherent with the host. configure_timezone # Assert we're superuser. [ "$(id -u)" = 0 ] if id $1 2> /dev/null; then echo "User with id $1 already exists. Please run this as a user id missing from " \ "the base Ubuntu container." echo echo "Container users:" paste <(cut -d : -f3 /etc/passwd) <(cut -d : -f1 /etc/passwd) | sort -n exit 1 fi if command -v apt-get > /dev/null; then apt-get update # The 'python' package doesn't exist on Ubuntu 22, so this installs python3. # TODO: It might not be necessary to install python here, as the container # will invoke bootstrap_system.sh. apt-get install -y sudo git python3 elif grep 'release 8\.' /etc/redhat-release; then # WARNING: Install the following packages one by one! # Installing them in a common transaction breaks something inside yum/dnf, # and the subsequent step installing Python2 will fail with a GPG signature error. dnf -y install sudo dnf -y install which dnf -y install git-core install_python2_for_centos8 else yum -y install which sudo git python fi if ! id impdev; then # Adduser is slightly different on CentOS and Ubuntu if which apt-get; then adduser --disabled-password --gecos "" --uid $1 impdev else adduser --uid $1 impdev fi echo "impdev ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers fi ulimit -a su impdev -c "$0 build_impdev" } # Sets up Impala environment function impala_environment() { pushd /home/impdev/Impala export IMPALA_HOME=/home/impdev/Impala source bin/impala-config.sh popd } # Starts SSH and PostgreSQL; configures container as necessary; # prepares Kudu for starting. function boot_container() { pushd /home/impdev/Impala # Make log directories. This is typically done in buildall.sh. mkdir -p logs/be_tests logs/fe_tests/coverage logs/ee_tests logs/custom_cluster_tests # Update /etc/hosts to remove the entry for the unique docker hostname, # and instead point it to 127.0.0.1. Otherwise, HttpFS returns Location: # redirects to said hostname, but the relevant datanode isn't listening # on the wildcard address. bootstrap_system.sh does this as well, but # Docker creates a new /etc/hosts every time a container is created, so # this needs to be done here as well. # # "sed -i" in place doesn't work on Docker, because /etc/hosts is a bind mount. sed -e /$(hostname)/d /etc/hosts > /tmp/hosts echo "127.0.0.1 $(hostname) $(hostname -s)" >> /tmp/hosts sudo cp /tmp/hosts /etc/hosts echo Hostname: $(hostname) echo Hosts file: cat /etc/hosts popd } function start_minicluster { # The subshell here avoids the verbose output from set -x. (echo ">>> Starting PostgreSQL and SSH") 2> /dev/null pushd /home/impdev/Impala # Required for metastore _pg_ctl start # Required for starting HBase if [ -f /etc/redhat-release ]; then if which systemctl; then # centos7 doesn't support systemd running inside of docker to start daemons sudo /usr/sbin/sshd else sudo service sshd start fi else sudo service ssh start fi (echo ">>> Copying Kudu Data") 2> /dev/null # Move around Kudu's WALs to avoid issue with Docker filesystems (aufs and # overlayfs) that don't support os.rename(2) on directories, which Kudu # requires. We make a fresh copy of the data, in which case rename(2) works # presumably because there's only one layer involved. See # https://issues.apache.org/jira/browse/KUDU-1419. set -x pushd /home/impdev/Impala/testdata for x in cluster/cdh*/node-*/var/lib/kudu/*/wal; do echo $x # This mv takes time, as it's actually copying into the latest layer. mv $x $x-orig mkdir $x mv $x-orig/* $x rmdir $x-orig done popd # Wait for postgresql to really start; if it doesn't, Hive Metastore will fail to start. for i in {1..120}; do echo connecting to postgresql attempt $i if sudo -u postgres psql -c "select 1"; then break else sleep 2 fi done sudo -u postgres psql -c "select 1" (echo ">>> Starting mini cluster") 2> /dev/null testdata/bin/run-all.sh popd } # Runs bootstrap_system.sh and then builds Impala. function build_impdev() { # Assert we're impdev now. [ "$(id -un)" = impdev ] # Bump "Max processes" ulimit to the hard limit; default # on CentOS 6 can be 1024, which isn't enough for minicluster. ulimit -u $(cat /proc/self/limits | grep 'Max processes' | awk '{ print $4 }') ulimit -a # Link in ccache from host. ln -s /ccache /home/impdev/.ccache # Instead of doing a full "git clone" of /repo, which is the host's checkout, # we only fetch one branch, without tags. This keeps the checkout # considerably lighter. mkdir /home/impdev/Impala pushd /home/impdev/Impala git init git fetch /git_common_dir --no-tags "$GIT_HEAD_REV" git checkout -b test-with-docker FETCH_HEAD # Link in logs. Logs are on the host since that's the most important thing to # look at after the tests are run. ln -sf /logs logs bin/bootstrap_system.sh impala_environment # Builds Impala and loads test data. # Note that IMPALA-6494 prevents us from using shared library builds, # which are smaller and thereby speed things up. We use "-notests" # to avoid building backend tests, which are sizable, and # can be built when executing those tests. We use "-noclean" to # avoid deleting the log for this invocation which is in logs/, # and, this is a first build anyway. if ! ./buildall.sh -noclean -format -testdata -notests; then echo "Build + dataload failed!" copy_cluster_logs return 1 fi # We make one exception to "-notests": # test_insert_parquet.py, which is used in all the end-to-end test # shards, depends on this binary. We build it here once, # instead of building it during the startup of each container running # a subset of E2E tests. Building it here is also a lot faster. if ! make -j$(nproc) --load-average=$(nproc) parquet-reader impala-profile-tool; then echo "Impala profile tool build failed!" copy_cluster_logs return 1 fi # Dump current memory usage to logs, before shutting things down. memory_usage || true # Shut down things cleanly. testdata/bin/kill-all.sh || true if ! hardlink_duplicate_hdfs_data; then echo "Hardlink duplicate HDFS data failed!" copy_cluster_logs return 1 fi copy_cluster_logs # Shutting down PostgreSQL nicely speeds up it's start time for new containers. _pg_ctl stop # Clean up things we don't need to reduce image size find be -name '*.o' -execdir rm '{}' + # ~1.6GB # Clean up dangling symlinks. These (typically "cluster/cdh*-node-*") # may point to something inside a container that no longer exists # and can confuse Jenkins. find /logs -xtype l -execdir rm '{}' ';' popd } # "Compress" HDFS data by de-duplicating blocks. As a result of # having three datanodes, our data load is 3x larger than it needs # to be. To alleviate this (to the tune of ~20GB savings), we # use hardlinks to link together the identical blocks. This is absolutely # taking advantage of an implementation detail of HDFS. function hardlink_duplicate_hdfs_data() { echo "Hardlinking duplicate HDFS block data." set +x for x in $(find testdata/cluster/*/node-1/data/dfs/dn/current/ -name 'blk_*[0-9]'); do for n in 2 3; do xn=${x/node-1/node-$n} if [ -f $xn ]; then rm $xn ln $x $xn fi done done set -x } # Prints top 20 RSS consumers (and other, total), in megabytes Common culprits # are Java processes without Xmx set. Since most things don't reclaim memory, # this is a decent proxy for peak memory usage by long-lived processes. function memory_usage() { ( echo "Top 20 memory consumers (RSS in MBs)" sudo ps -axho rss,args | \ sed -e 's/^ *//' | \ sed -e 's, ,\t,' | \ sort -nr | \ awk -F'\t' ' FNR < 20 { print $1/1024.0, $2; total += $1/1024.0 } FNR >= 20 { other+= $1/1024.0; total += $1/1024.0 } END { if (other) { print other, "-- other --" }; print total, "-- total --" }' ) >& /logs/memory_usage.txt } # Some components like hdfs, yarn, kudu creates their log in # testdata/cluster/cdh/node-/var/log/ folder # these log folders are symlinked to logs/cluster/ folder # remove symlinks and copy these logs to logs/cluster/ function copy_cluster_logs() { echo ">>> Copy cluster logs..." pushd /home/impdev/Impala for x in testdata/cluster/cdh*/node-*/var/log/; do echo $x if [ -d $x ]; then CDH_VERSION=`echo $x | sed "s#testdata/cluster/\(.*\)/node-.*#\1#"` NODE_NUMBER=`echo $x | sed "s#testdata/cluster/cdh.*/\(.*\)/var.*#\1#"` rm -rf logs/cluster/${CDH_VERSION}-${NODE_NUMBER} mkdir -p logs/cluster/${CDH_VERSION}-${NODE_NUMBER} cp -R $x/* logs/cluster/${CDH_VERSION}-${NODE_NUMBER} fi done popd } # Runs a suite passed in as the first argument. Tightly # coupled with Impala's run-all-tests and the suite names. # from test-with-docker.py. # # Before running tests, starts up the minicluster. function test_suite() { cd /home/impdev/Impala # These test suites are for testing. if [[ $1 == NOOP ]]; then # Sleep busily for 10 seconds. bash -c 'while [[ $SECONDS -lt 10 ]]; do :; done' return 0 fi if [[ $1 == NOOP_FAIL ]]; then return 1 fi if [[ $1 == NOOP_SLEEP_FOREVER ]]; then # Handy to test timeouts. while true; do sleep 60; done fi # Assert that we're running as impdev [ "$(id -un)" = impdev ] # Assert that /home/impdev/Impala/logs is a symlink to /logs. [ "$(readlink /home/impdev/Impala/logs)" = /logs ] boot_container impala_environment if [[ ${REBUILD_ASAN:-false} = true ]]; then # Note: we're not redoing data loading. SKIP_TOOLCHAIN_BOOTSTRAP=true ./buildall.sh -noclean -notests -asan fi # Build the BE test binaries if needed. if [[ $1 = BE_TEST* ]]; then make -j$(nproc) --load-average=$(nproc) be-test be-benchmarks fi if [[ $1 == RAT_CHECK ]]; then # Runs Apache RAT (a license checker) git archive --prefix=rat/ -o rat-impala.zip HEAD wget --quiet https://archive.apache.org/dist/creadur/apache-rat-0.12/apache-rat-0.12-bin.tar.gz tar xzf apache-rat-0.12-bin.tar.gz java -jar apache-rat-0.12/apache-rat-0.12.jar -x rat-impala.zip > logs/rat.xml bin/check-rat-report.py bin/rat_exclude_files.txt logs/rat.xml return $? fi # Start the minicluster start_minicluster # By default, the JVM will use 1/4 of your OS memory for its heap size. For a # long-running test, this will delay GC inside of impalad's leading to # unnecessarily large process RSS footprints. To combat this, we # set a small initial heap size, and then cap it at a more reasonable # size. The small initial heap sizes help for daemons that do little # in the way of JVM work (e.g., the 2nd and 3rd impalad's). # Note that "test_insert_large_string" fails at 2g and 3g, so the suite that # includes it (EE_TEST_PARALLEL) gets additional memory. # Note that we avoid using TEST_START_CLUSTER_ARGS="--jvm-args=..." # because it gets flattened along the way if we need to provide # more than one Java argument. We use JAVA_TOOL_OPTIONS instead. JVM_HEAP_MAX_GB=2 if [[ $1 = EE_TEST_PARALLEL ]]; then JVM_HEAP_MAX_GB=4 elif [[ $1 = EE_TEST_PARALLEL_EXHAUSTIVE ]]; then JVM_HEAP_MAX_GB=8 fi JAVA_TOOL_OPTIONS="-Xms512M -Xmx${JVM_HEAP_MAX_GB}G" # Similarly, bin/start-impala-cluster typically configures the memlimit # to be 80% of the machine memory, divided by the number of daemons. # If multiple containers are to be run simultaneously, this is scaled # down in test-with-docker.py (and further configurable with --impalad-mem-limit-bytes) # and passed in via $IMPALAD_MEM_LIMIT_BYTES to the container. There is a # relationship between the number of parallel tests that can be run by py.test and this # limit. export TEST_START_CLUSTER_ARGS="--impalad_args=--mem_limit=$IMPALAD_MEM_LIMIT_BYTES" export MAX_PYTEST_FAILURES=0 # Asserting that these should are all set (to either true or false as strings). # This is how run-all.sh chooses between them. [[ $FE_TEST && $BE_TEST && $EE_TEST && $JDBC_TEST && $CLUSTER_TEST ]] ret=0 # Run tests. (echo ">>> $1: Starting run-all-test") 2> /dev/null if ! time -p bash -x bin/run-all-tests.sh; then ret=1 echo "Tests $1 failed!" else echo "Tests $1 succeeded!" fi # Save memory usage after tests have run but before shutting down the cluster. memory_usage || true # Oddly, I've observed bash fail to exit (and wind down the container), # leading to test-with-docker.py hitting a timeout. Killing the minicluster # daemons fixes this. testdata/bin/kill-all.sh || true copy_cluster_logs return $ret } # It's convenient (for log files to be legible) for the container # to have the host timezone. However, /etc/localtime is finnicky # (see localtime(5)) and mounting it to the host /etc/localtime or # symlinking it there doesn't always work. Instead, we expect # $LOCALTIME_LINK_TARGET to be set to a path in /usr/share/zoneinfo. function configure_timezone() { if [ -e "${LOCALTIME_LINK_TARGET}" ]; then ln -sf "${LOCALTIME_LINK_TARGET}" /etc/localtime # Only Debian-based distros have this file. if [ -f /etc/timezone ]; then echo "${LOCALTIME_LINK_TARGET}" | sed -e 's,.*zoneinfo/,,' > /etc/timezone fi else echo '$LOCALTIME_LINK_TARGET not configured.' 1>&2 fi } # Exposes a shell, with the container booted with # a minicluster. function shell() { echo "Starting minicluster and Impala." # Logs is typically a symlink; remove it if so. rm logs || true mkdir -p logs boot_container impala_environment start_minicluster bin/start-impala-cluster.py cat <<"EOF" ========================================================== Welcome to the Impala development environment. The "minicluster" is running; i.e., HDFS, HBase, Hive, etc. are running. $KUDU_MSG To get started, perhaps run: impala-shell.sh -q 'select count(*) from tpcds.web_page' ========================================================== EOF exec bash } function main() { set -e # Run given command CMD="$1" shift # Treat shell specialy to avoid the extra logging and | cat below. if [[ $CMD = "shell" ]]; then shell # shell shoud have exec'd, so if we get here, it's a failure. exit 1 fi echo ">>> ${CMD} $@ (begin)" # Dump environment, for debugging env | grep -vE "AWS_(SECRET_)?ACCESS_KEY" ulimit -a set -x # The "| cat" here avoids "set -e"/errexit from exiting the # script right away. "${CMD}" "$@" | cat ret=${PIPESTATUS[0]} set +x echo ">>> ${CMD} $@ ($ret) (end)" exit $ret } # Run main() unless we're being sourced. if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then main "$@" fi