From 0eaff805e28dd4afac134f58b294732e414235ce Mon Sep 17 00:00:00 2001 From: Jim Apple Date: Sun, 23 Oct 2016 14:54:08 -0700 Subject: [PATCH] Add distcc infrastructure. This has been working for several months, and it was written mainly by Casey Ching while he was at Cloudera working on Impala. Change-Id: Ia4bc78ad46dda13e4533183195af632f46377cae Reviewed-on: http://gerrit.cloudera.org:8080/4820 Reviewed-by: Jim Apple Tested-by: Internal Jenkins --- .gitignore | 2 +- bin/distcc/.gitignore | 1 + bin/distcc/README.md | 106 ++++++++++++++++++++++++++ bin/distcc/distcc.sh | 62 +++++++++++++++ bin/distcc/distcc_env.sh | 160 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 330 insertions(+), 1 deletion(-) create mode 100644 bin/distcc/.gitignore create mode 100644 bin/distcc/README.md create mode 100755 bin/distcc/distcc.sh create mode 100644 bin/distcc/distcc_env.sh diff --git a/.gitignore b/.gitignore index 849ee61c0..e63f86381 100644 --- a/.gitignore +++ b/.gitignore @@ -13,7 +13,7 @@ org.eclipse.jdt.ui.prefs load-*-generated.sql bin/version.info -# Cloudera distcc options +# distcc options .impala_compiler_opts pprof.out diff --git a/bin/distcc/.gitignore b/bin/distcc/.gitignore new file mode 100644 index 000000000..ce71f709c --- /dev/null +++ b/bin/distcc/.gitignore @@ -0,0 +1 @@ +ld diff --git a/bin/distcc/README.md b/bin/distcc/README.md new file mode 100644 index 000000000..2de7d4ac7 --- /dev/null +++ b/bin/distcc/README.md @@ -0,0 +1,106 @@ +# Distcc +Distcc will speed up compilation by distributing compilation tasks to remote build +machines. The scripts in this folder make using distcc easier. + +# Requirements + +The only requirement you should need to be aware of is that the scripts in this folder were +only tested on Linux. If you are using OS X, things probably won't work out of the box. + +Assuming you are using Linux, if you use the scripts in this folder, there shouldn't be +any other requirements. 
The distcc program should be installed and configured +automatically. Still, understanding what is involved could be useful. + +**You shouldn't need to do any of this; the scripts do this for you.** + +1. Install distcc and ccache. Most Linux distros have these packages. The scripts will + install them if you have a yum or apt-get based system. Otherwise you should install + distcc and ccache yourself through whatever package manager your system uses. +1. Configure the remote distcc hosts. Set your environment variable BUILD_FARM to + "host1/limit1,lzo host2/limit2,lzo" and so on. +1. Your local compiler needs to be at the same path as it is on the remote build slaves. + That path is /opt/Impala-Toolchain/gcc-$IMPALA_GCC_VERSION/bin/gcc. In other words, make + sure the Impala toolchain is available at /opt/Impala-Toolchain. That can be done + through a symlink, and that's what the scripts will attempt to set up. + +# Usage + +### First time +1. Source bin/impala-config.sh in the Impala repo. Step #2 depends on this. + + source "$IMPALA_HOME"/bin/impala-config.sh + +1. Source "distcc_env.sh" in this directory. The script will attempt to install distcc + and ccache if needed. + + source "$IMPALA_HOME"/bin/distcc/distcc_env.sh + +1. Run buildall.sh. The main purpose is to regenerate cmakefiles. + + cd "$IMPALA_HOME" + ./buildall.sh -skiptests -so # Do not use -noclean + + You should notice that the build runs quite a bit faster. + +### Incremental builds +At this point you no longer need to run the heavyweight buildall.sh. After editing files +you can either +``` +make -j$(distcc -j) +``` +or +``` +bin/make_impala.sh +``` + +### Switching back to local compilation +If you want to compile a very small change, a local build might be faster. +``` +switch_compiler local +``` +to switch back +``` +switch_compiler distcc +``` + +### Switch to clang++ +Clang is faster and gives better error messages. This setup is still somewhat +experimental. 
+``` +switch_compiler clang +``` +to switch back +``` +switch_compiler gcc +``` + +### Second time +If you open a new terminal and attempt to build with "make" or "bin/make_impala.sh", +that will fail. To fix: +``` +source "$IMPALA_HOME"/bin/impala-config.sh # Skip if already done +source "$IMPALA_HOME"/bin/distcc/distcc_env.sh +``` + +# Setting up a new distcc server + +1. Install "distccd" and "ccache". +1. Configure distccd (edit /etc/sysconfig/distccd on a RHEL server) with the options + OPTIONS="--jobs 96 --allow YOUR.IP.ADDRESS.HERE --log-level=warn --nice=-15" + Where num jobs = 2x the number of cores on the machine. (2x is recommended by distcc.) +1. Start distccd. +1. Add the new host to the BUILD_FARM environment variable read by distcc_env.sh. +1. Install all gcc and binutils versions from the toolchain into /opt/Impala-Toolchain. +1. ccache stores its cache in $HOME/.ccache. Assuming distcc is running as a non-root user + that has no $HOME, you must sudo mkdir /.ccache, then sudo chmod 777 /.ccache. +1. If distcc runs as "nobody", sudo -u nobody ccache -M 25G. This sets the size of the + cache to 25GB. Adjust to your taste. + +# Misc notes + +1. "pump" doesn't work. Many compilation attempts time out and say something like "Include + server did not process the request in 3.8 seconds". distcc tries to copy 3rd party + headers to the remote hosts and that may be the problem. If we could get the include + server to use the remote 3rd party headers that should help. +1. Having a different local Linux OS on your development machine than on the distcc hosts + should be fine. diff --git a/bin/distcc/distcc.sh b/bin/distcc/distcc.sh new file mode 100755 index 000000000..a1136e8fb --- /dev/null +++ b/bin/distcc/distcc.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Compiler wrapper (IMPALA_CXX_COMPILER): runs gcc/clang, optionally through distcc+ccache.
+if [[ -z "$DISTCC_HOSTS" || -z "$IMPALA_REAL_CXX_COMPILER" ]]; then
+  # This could be sourced here and the build would work but the parallelization (-j)
+  # should be wrong at this point and it's too late to fix.
+  echo "You must source '$(dirname "$0")/distcc_env.sh' before attempting to build." 1>&2
+  exit 1
+fi
+
+# distcc needs the toolchain at the same path as on the build hosts; symlink it if missing.
+TOOLCHAIN_DIR=/opt/Impala-Toolchain
+if [[ ! -d "$TOOLCHAIN_DIR" ]]; then
+  if [[ -z "$IMPALA_TOOLCHAIN" || ! -d "$IMPALA_TOOLCHAIN" ]]; then
+    echo "The toolchain wasn't found at '$TOOLCHAIN_DIR' and IMPALA_TOOLCHAIN is not set." \
+        "Make sure the toolchain is available at $TOOLCHAIN_DIR and try again." 1>&2
+    exit 1
+  elif ! sudo -n -- ln -s "$IMPALA_TOOLCHAIN" "$TOOLCHAIN_DIR" &>/dev/null; then
+    echo "The toolchain must be available at $TOOLCHAIN_DIR for distcc." \
+        "Try running \"sudo ln -s $IMPALA_TOOLCHAIN $TOOLCHAIN_DIR\"." 1>&2
+    exit 1
+  fi
+fi
+
+# CMD is the compiler command; CMD_POST_ARGS are flags appended after the caller's arguments.
+CMD=()
+CMD_POST_ARGS=()
+# Only the literal "false" disables distcc; the value is compared, never run as a command.
+[[ "$IMPALA_USE_DISTCC" != false ]] && CMD=(distcc ccache)
+
+GCC_ROOT="$TOOLCHAIN_DIR/gcc-$IMPALA_GCC_VERSION"
+case "$IMPALA_REAL_CXX_COMPILER" in
+  gcc) CMD+=("$GCC_ROOT/bin/g++");;
+  clang) # Assume the compilation options were setup for gcc, which would happen using
+         # default build options. Now some additional options need to be added for clang.
+         CMD+=("$TOOLCHAIN_DIR/llvm-$IMPALA_LLVM_ASAN_VERSION/bin/clang++")
+         CMD+=("--gcc-toolchain=$GCC_ROOT")
+         # -Wno-unused-local-typedef needs to go after -Wall
+         # -Wno-error is needed, clang generates more warnings than gcc.
+         CMD_POST_ARGS+=(-Wno-unused-local-typedef -Wno-error);;
+  *) echo "Unexpected IMPALA_REAL_CXX_COMPILER: '$IMPALA_REAL_CXX_COMPILER'" 1>&2
+     exit 1;;
+esac
+
+exec "${CMD[@]}" "$@" "${CMD_POST_ARGS[@]}"
diff --git a/bin/distcc/distcc_env.sh b/bin/distcc/distcc_env.sh
new file mode 100644
index 000000000..173cc181a
--- /dev/null
+++ b/bin/distcc/distcc_env.sh
@@ -0,0 +1,160 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# This file is intended to be sourced by a shell (zsh and bash have been tested).
+
+if [[ -z $BUILD_FARM ]]; then
+  echo "BUILD_FARM must be set to configure distcc" >&2
+  return 1
+fi
+
+if [[ ! -z $ZSH_NAME ]]; then
+  DISTCC_ENV_DIR=$(cd $(dirname ${(%):-%x}) && pwd)
+else
+  DISTCC_ENV_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+fi
+
+# Succeeds if the command named by $1 can be found by the shell.
+function cmd_exists {
+  command -v "$1" &>/dev/null
+}
+
+INSTALLER=
+if cmd_exists apt-get; then
+  INSTALLER=apt-get
+elif cmd_exists yum; then
+  INSTALLER=yum
+fi
+
+if ! cmd_exists distcc; then
+  echo distcc command not found, attempting installation
+  if [[ -z $INSTALLER ]] || ! sudo $INSTALLER -y install distcc; then
+    echo Unable to automatically install distcc. You need to install it manually. 1>&2
+    return 1
+  fi
+fi
+
+# Install CCache if necessary.
+if ! cmd_exists ccache; then
+  echo "ccache command not found, attempting installation"
+  if [[ -z $INSTALLER ]] || ! sudo $INSTALLER -y install ccache; then
+    echo "Unable to automatically install ccache" 1>&2
+    return 1
+  fi
+fi
+
+# Don't include localhost in the list. It is already the slowest part of the build because
+# it needs to do preprocessing and linking. There shouldn't be a need to add an extra
+# compilation worker.
+export DISTCC_HOSTS=
+DISTCC_HOSTS+=" --localslots=$(nproc)"
+DISTCC_HOSTS+=" --localslots_cpp=$(nproc)"
+DISTCC_HOSTS+=" --randomize"
+DISTCC_HOSTS+=" ${BUILD_FARM}"
+
+# The compiler that distcc.sh should use: gcc or clang.
+export IMPALA_REAL_CXX_COMPILER="${IMPALA_REAL_CXX_COMPILER-}"
+
+# Set to false to use local compilation instead of distcc.
+export IMPALA_USE_DISTCC="${IMPALA_USE_DISTCC-}"
+
+# Even after generating make files, some state about compiler options would only exist in
+# environment vars. Any such vars should be saved to this file so they can be restored.
+if [[ -z "$IMPALA_HOME" ]]; then
+  echo '$IMPALA_HOME must be set before sourcing this file.' 1>&2
+  return 1
+fi
+IMPALA_COMPILER_CONFIG_FILE="$IMPALA_HOME/.impala_compiler_opts"
+
+# Completely disable anything that could have been setup using this script and clean
+# the make files.
+function disable_distcc {
+  export IMPALA_CXX_COMPILER=default
+  export IMPALA_BUILD_THREADS=$(nproc)
+  save_compiler_opts
+  if ! clean_cmake_files; then
+    echo Failed to clean cmake files. 1>&2
+    return 1
+  fi
+  echo "distcc is not fully disabled, run 'buildall.sh' to complete the change." \
+    "Run 'enable_distcc' to enable."
+}
+
+# Points IMPALA_CXX_COMPILER at distcc.sh, selects distcc + gcc, and cleans the make files.
+function enable_distcc {
+  export IMPALA_CXX_COMPILER="$DISTCC_ENV_DIR"/distcc.sh
+  switch_compiler distcc gcc
+  export IMPALA_BUILD_THREADS=$(distcc -j)
+  if ! clean_cmake_files; then
+    echo Failed to clean cmake files. 1>&2
+    return 1
+  fi
+  echo "distcc is not fully enabled, run 'buildall.sh' to complete the change." \
+    "Run 'disable_distcc' or 'switch_compiler local' to disable."
+}
+
+# Cleans old CMake files, this is required when switching between distcc.sh and direct
+# compilation.
+function clean_cmake_files {
+  if [[ -z "$IMPALA_HOME" || ! -d "$IMPALA_HOME" ]]; then
+    echo IMPALA_HOME=$IMPALA_HOME is not valid. 1>&2
+    return 1
+  fi
+  # Copied from $IMPALA_HOME/bin/clean.sh.
+  find "$IMPALA_HOME" -iname '*cmake*' -not -name CMakeLists.txt \
+      -not -path '*cmake_modules*' -not -path '*thirdparty*' -print0 | xargs -0 rm -rf
+}
+
+# Applies each argument (local|distcc|gcc|clang) in order, then saves the resulting options.
+function switch_compiler {
+  for ARG in "$@"; do
+    case "$ARG" in
+      "local")
+        IMPALA_USE_DISTCC=false
+        IMPALA_BUILD_THREADS=$(nproc);;
+      distcc)
+        IMPALA_USE_DISTCC=true
+        IMPALA_BUILD_THREADS=$(distcc -j);;
+      gcc) IMPALA_REAL_CXX_COMPILER=gcc;;
+      clang) IMPALA_REAL_CXX_COMPILER=clang;;
+      *) echo "Valid compiler options are:
+    'local' - Don't use distcc and set -j value to $(nproc). (gcc/clang) remains unchanged.
+    'distcc' - Use distcc and set -j value to $(distcc -j). (gcc/clang) remains unchanged.
+    'gcc' - Use gcc. (local/distcc remains unchanged).
+    'clang' - Use clang. (local/distcc remains unchanged)." 1>&2
+        return 1;;
+    esac
+  done
+  save_compiler_opts
+}
+
+# Writes the current compiler settings to $IMPALA_COMPILER_CONFIG_FILE for later sourcing.
+function save_compiler_opts {
+  rm -f "$IMPALA_COMPILER_CONFIG_FILE"
+  cat <<EOF > "$IMPALA_COMPILER_CONFIG_FILE"
+IMPALA_CXX_COMPILER=$IMPALA_CXX_COMPILER
+IMPALA_BUILD_THREADS=$IMPALA_BUILD_THREADS
+IMPALA_USE_DISTCC=$IMPALA_USE_DISTCC
+IMPALA_REAL_CXX_COMPILER=$IMPALA_REAL_CXX_COMPILER
+EOF
+}
+
+if [[ -e "$IMPALA_COMPILER_CONFIG_FILE" ]]; then
+  source "$IMPALA_COMPILER_CONFIG_FILE"
+else
+  enable_distcc
+fi