From c3dc7f9667564a4ffb9644e6f3d0103c007cdc10 Mon Sep 17 00:00:00 2001 From: Michael Smith Date: Wed, 22 Oct 2025 13:23:22 -0700 Subject: [PATCH] IMPALA-13147: Limit concurrency of link jobs Configures separate compile and link pools for ninja. Configures link parallelism based on the expected memory use per link; that memory use can be reduced by setting IMPALA_MINIMAL_DEBUG_INFO=true or IMPALA_SPLIT_DEBUG_INFO=true. Adds IMPALA_MAKE_CMD to simplify using the ninja build tool for all make operations in scripts. Installs ninja on Ubuntu. Adds a '-make' option to buildall.sh to force using 'make'. Adds MOLD_JOBS=1 to avoid overloading the system when trying 'mold' and linking test binaries. However, 'mold' is not selected as the default due to test failures around SASL/GSSAPI (see IMPALA-14527). Switches bin/jenkins/all-tests.sh to use ninja and removes the guard in bootstrap_development.sh limiting IMPALA_BUILD_THREADS, as it's no longer needed with ninja. SKIP_BE_TEST_PATTERN in run-backend-tests is effectively unused (only used with TARGET_FILESYSTEM=local), so this change does not attempt to make it work with ninja. Tested with local 'IMPALA_SPLIT_DEBUG_INFO=true buildall.sh -skiptests' with default (make) and IMPALA_MAKE_CMD=ninja. 
Change-Id: I0952dc19ace5c9c42bed0d2ffb61499656c0a2db Reviewed-on: http://gerrit.cloudera.org:8080/23572 Reviewed-by: Joe McDonnell Reviewed-by: Pranav Lodha Tested-by: Impala Public Jenkins --- CMakeLists.txt | 6 ++++++ README-build.md | 6 ++++-- bin/bootstrap_build.sh | 2 +- bin/bootstrap_development.sh | 10 ---------- bin/clean.sh | 2 +- bin/impala-config.sh | 33 ++++++++++++++++++++++++++++----- bin/jenkins/all-tests.sh | 3 +++ bin/run-backend-tests.sh | 13 +++++++------ buildall.sh | 5 ++++- testdata/bin/copy-udfs-udas.sh | 2 +- 10 files changed, 55 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f19d443fc..9aaa132dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,12 @@ set(IMPALA_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) # Build compile commands database set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +# Configure ninja build pools +set(CMAKE_JOB_POOLS compilation_pool=$ENV{IMPALA_BUILD_THREADS} + link_pool=$ENV{IMPALA_LINK_THREADS}) +set(CMAKE_JOB_POOL_COMPILE compilation_pool) +set(CMAKE_JOB_POOL_LINK link_pool) + # Codegen-dependent executables need to be linked with -rdynamic; otherwise LLVM # can't find dependent symbols at runtime. # diff --git a/README-build.md b/README-build.md index 93d2dc181..7341d5c31 100644 --- a/README-build.md +++ b/README-build.md @@ -57,11 +57,13 @@ can do so through the environment variables and scripts listed below. | Environment variable | Default value | Description | |----------------------|---------------|-------------| -| IMPALA_BUILD_THREADS | "8" or set to number of processors by default. | Used for make -j and distcc -j settings. | +| IMPALA_BUILD_THREADS | Number of processors. | Used for make -j and distcc -j settings. | +| IMPALA_LINK_THREADS | Bounded based on available memory. | Used for ninja. | +| IMPALA_MAKE_CMD | "make" | Make tool to use by default, options are make or ninja. | | IMPALA_MAKE_FLAGS | "" | Any extra settings to pass to make. 
Also used when copying udfs / udas into HDFS. | | USE_SYSTEM_GCC | "0" | If set to any other value, directs cmake to not set GCC_ROOT, CMAKE_C_COMPILER, CMAKE_CXX_COMPILER, as well as setting TOOLCHAIN_LINK_FLAGS | | IMPALA_CXX_COMPILER | "default" | Used by cmake (cmake_modules/toolchain and clang_toolchain.cmake) to select gcc / clang | -| IMPALA_LINKER. | "gold" | Specifies the linker to use. | +| IMPALA_LINKER | "gold" | Specifies the linker to use; options are "gold", "mold", or "ld". | | IS_OSX | "false" | (Experimental) currently only used to disable Kudu. | ## Dependencies diff --git a/bin/bootstrap_build.sh b/bin/bootstrap_build.sh index 6ef7d6ce1..2dde710bd 100755 --- a/bin/bootstrap_build.sh +++ b/bin/bootstrap_build.sh @@ -36,7 +36,7 @@ sudo -E apt-get --quiet update # unversioned python-dev and python-setuptools are not available on newer releases # that don't support Python 2. Add them only when they exist for the platform, # otherwise set Python 3 to be the default Python version. 
-PACKAGES='g++ gcc git libsasl2-dev libssl-dev make +PACKAGES='g++ gcc git libsasl2-dev libssl-dev make ninja-build python3-dev python3-setuptools python3-venv libffi-dev language-pack-en libkrb5-dev krb5-admin-server krb5-kdc krb5-user libxml2-dev libxslt-dev wget' diff --git a/bin/bootstrap_development.sh b/bin/bootstrap_development.sh index 1b1009f83..f4b0e5e9a 100755 --- a/bin/bootstrap_development.sh +++ b/bin/bootstrap_development.sh @@ -54,16 +54,6 @@ source "${BINDIR}/bootstrap_system.sh" export MAX_PYTEST_FAILURES=0 source bin/impala-config.sh > /dev/null 2>&1 -BOUNDED_CONCURRENCY=$((AVAILABLE_MEM / 4)) -if [[ $AVAILABLE_MEM -lt 4 ]]; then - echo "Insufficient memory ($AVAILABLE_MEM GB) to link Impala test binaries" - echo "Increase memory, or run buildall.sh -format -testdata -notests" - exit 1 -elif [[ $BOUNDED_CONCURRENCY -lt $IMPALA_BUILD_THREADS ]]; then - echo "Bounding concurrency to $BOUNDED_CONCURRENCY for link phase" - IMPALA_BUILD_THREADS=$BOUNDED_CONCURRENCY -fi - time -p ./buildall.sh -format -testdata -skiptests # To then run the tests: diff --git a/bin/clean.sh b/bin/clean.sh index 8a4f45371..e95f5308d 100755 --- a/bin/clean.sh +++ b/bin/clean.sh @@ -27,7 +27,7 @@ setup_report_build_error # If the project was never build, no Makefile will exist and thus make clean will fail. # Combine the make command with the bash noop to always return true. -"${MAKE_CMD:-make}" clean || : +"${MAKE_CMD:-${IMPALA_MAKE_CMD}}" clean || : # clean Java projects pushd "${IMPALA_HOME}/java" diff --git a/bin/impala-config.sh b/bin/impala-config.sh index 62a42142f..532735b8a 100755 --- a/bin/impala-config.sh +++ b/bin/impala-config.sh @@ -580,6 +580,9 @@ export LIB_JAVA=$(find "${JAVA_HOME}/" -name libjava.so | head -1) export LIB_JSIG=$(find "${JAVA_HOME}/" -name libjsig.so | head -1) export LIB_JVM=$(find "${JAVA_HOME}/" -name libjvm.so | head -1) +# Default to make, but allow overriding to e.g. ninja. 
+export IMPALA_MAKE_CMD=${IMPALA_MAKE_CMD:-make} + ######################################################################################### # Below here are variables that can be overridden by impala-config-*.sh and environment # # vars, variables computed based on other variables, and variables that cannot be # @@ -608,6 +611,10 @@ export USE_SYSTEM_GCC=${USE_SYSTEM_GCC-0} # TODO: Add support for lld as well export IMPALA_LINKER=${IMPALA_LINKER-gold} +# Limit mold to a single job to avoid excessive memory consumption while fully utilizing +# available CPUs. +export MOLD_JOBS=${IMPALA_MOLD_JOBS-1} + # Override the default compiler by setting a path to the new compiler. The default # compiler depends on USE_SYSTEM_GCC and IMPALA_GCC_VERSION. The intended use case # is to set the compiler to distcc, in that case the user would also set @@ -1023,7 +1030,7 @@ export IMPALA_DATASET_DIR="$IMPALA_HOME/testdata/datasets" export IMPALA_AUX_DATASET_DIR="$IMPALA_AUX_TEST_HOME/testdata/datasets" export IMPALA_COMMON_DIR="$IMPALA_HOME/common" export PATH="$IMPALA_TOOLCHAIN_PACKAGES_HOME/gdb-$IMPALA_GDB_VERSION/bin:$PATH" -export PATH="$IMPALA_TOOLCHAIN_PACKAGES_HOME/cmake-$IMPALA_CMAKE_VERSION/bin/:$PATH" +export PATH="$IMPALA_TOOLCHAIN_PACKAGES_HOME/cmake-$IMPALA_CMAKE_VERSION/bin:$PATH" export PATH="$IMPALA_HOME/bin:$PATH" export HADOOP_CONF_DIR="$IMPALA_FE_DIR/src/test/resources" @@ -1165,16 +1172,31 @@ else CGROUP_MEM_LIMIT=8589934591 # max int64 bytes in GB fi AVAILABLE_MEM=$((AVAILABLE_MEM > $CGROUP_MEM_LIMIT ? 
$CGROUP_MEM_LIMIT : $AVAILABLE_MEM)) -BOUNDED_CONCURRENCY=$((AVAILABLE_MEM / 2)) -if [[ $AVAILABLE_MEM -lt 2 ]]; then +if [[ $AVAILABLE_MEM -lt 5 ]]; then echo "Insufficient memory ($AVAILABLE_MEM GB) to build Impala" exit 1 -elif [[ $BOUNDED_CONCURRENCY -lt $CORES ]]; then +fi +BOUNDED_CONCURRENCY=$((AVAILABLE_MEM / 2)) +if [[ $BOUNDED_CONCURRENCY -lt $CORES ]]; then echo "Bounding concurrency for available memory ($AVAILABLE_MEM GB)" else BOUNDED_CONCURRENCY=$CORES fi -export IMPALA_BUILD_THREADS=${IMPALA_BUILD_THREADS-"${BOUNDED_CONCURRENCY}"} +export IMPALA_BUILD_THREADS=${IMPALA_BUILD_THREADS:-"${BOUNDED_CONCURRENCY}"} +# Limit number of links; only works with ninja builds. +# Determines number of concurrent links based on expected memory use. +if [[ "$IMPALA_MINIMAL_DEBUG_INFO" == "true" || + "$IMPALA_SPLIT_DEBUG_INFO" == "true" ]]; then + MEM_PER_LINK=2 +else + MEM_PER_LINK=5 +fi +BOUNDED_LINKS=$((AVAILABLE_MEM / MEM_PER_LINK)) +if [[ $BOUNDED_LINKS -gt $IMPALA_BUILD_THREADS ]]; then + # Avoid regressing behavior if IMPALA_BUILD_THREADS is already set to a low value. + BOUNDED_LINKS=${IMPALA_BUILD_THREADS} +fi +export IMPALA_LINK_THREADS=${IMPALA_LINK_THREADS:-"${BOUNDED_LINKS}"} # Additional flags to pass to make or ninja. 
export IMPALA_MAKE_FLAGS=${IMPALA_MAKE_FLAGS-} @@ -1258,6 +1280,7 @@ echo "IMPALA_OBS_VERSION = $IMPALA_OBS_VERSION" echo "IMPALA_SYSTEM_PYTHON2 = $IMPALA_SYSTEM_PYTHON2" echo "IMPALA_SYSTEM_PYTHON3 = $IMPALA_SYSTEM_PYTHON3" echo "IMPALA_BUILD_THREADS = $IMPALA_BUILD_THREADS" +echo "IMPALA_LINK_THREADS = $IMPALA_LINK_THREADS" echo "NUM_CONCURRENT_TESTS = $NUM_CONCURRENT_TESTS" echo "USE_CUSTOM_IMPALA_BASE_IMAGE = $USE_CUSTOM_IMPALA_BASE_IMAGE" echo "IMPALA_CUSTOM_DOCKER_BASE = $IMPALA_CUSTOM_DOCKER_BASE" diff --git a/bin/jenkins/all-tests.sh b/bin/jenkins/all-tests.sh index 6624a7dae..b74da5545 100644 --- a/bin/jenkins/all-tests.sh +++ b/bin/jenkins/all-tests.sh @@ -31,6 +31,9 @@ export IMPALA_MAVEN_OPTIONS="-U" # Allow unlimited pytest failures export MAX_PYTEST_FAILURES=0 +# Use ninja for better link concurrency. +export IMPALA_MAKE_CMD=ninja + # When UBSAN_FAIL is "death", the logs are monitored for UBSAN errors. Any errors will # then cause this script to exit. # diff --git a/bin/run-backend-tests.sh b/bin/run-backend-tests.sh index 9952496f1..ba2a7d981 100755 --- a/bin/run-backend-tests.sh +++ b/bin/run-backend-tests.sh @@ -28,11 +28,6 @@ export GTEST_OUTPUT="xml:$IMPALA_BE_TEST_LOGS_DIR/" # The backend unit tests currently do not work when HEAPCHECK is enabled. export HEAPCHECK= -BE_TEST_ARGS="" -if [[ -n "$SKIP_BE_TEST_PATTERN" ]]; then - BE_TEST_ARGS="-E ${SKIP_BE_TEST_PATTERN}" -fi - cd ${IMPALA_BE_DIR} . ${IMPALA_HOME}/bin/set-classpath.sh cd .. @@ -44,4 +39,10 @@ export ASAN_OPTIONS="disable_coredump=0:unmap_shadow_on_exit=1" export UBSAN_OPTIONS="disable_coredump=0:unmap_shadow_on_exit=1" export PATH="${IMPALA_TOOLCHAIN_PACKAGES_HOME}/llvm-${IMPALA_LLVM_VERSION}/bin:${PATH}" -"${MAKE_CMD:-make}" test ARGS="${BE_TEST_ARGS}" \ No newline at end of file +if [[ -n "$SKIP_BE_TEST_PATTERN" ]]; then + # Requires make, will fail with ninja. 
+ "${MAKE_CMD:-${IMPALA_MAKE_CMD}}" test ARGS="-E ${SKIP_BE_TEST_PATTERN}" +else + # Ninja doesn't accept additional parameters, so omit ARGS. + "${MAKE_CMD:-${IMPALA_MAKE_CMD}}" test +fi diff --git a/buildall.sh b/buildall.sh index f95b291ec..70f905c51 100755 --- a/buildall.sh +++ b/buildall.sh @@ -80,7 +80,7 @@ BUILD_DEBUG_NOOPT=0 BUILD_SHARED_LIBS=0 UDF_DEVEL=0 # Export MAKE_CMD so it is visible in scripts that invoke make, e.g. copy-udfs-udas.sh -export MAKE_CMD=make +export MAKE_CMD=${IMPALA_MAKE_CMD:-make} # Defaults that can be picked up from the environment, but are overridable through the # commandline. @@ -203,6 +203,9 @@ do -ninja) MAKE_CMD=ninja ;; + -make) + MAKE_CMD=make + ;; -cmake_only) GEN_CMAKE_ONLY=1 ;; diff --git a/testdata/bin/copy-udfs-udas.sh b/testdata/bin/copy-udfs-udas.sh index e73f38d1e..581df979a 100755 --- a/testdata/bin/copy-udfs-udas.sh +++ b/testdata/bin/copy-udfs-udas.sh @@ -49,7 +49,7 @@ done if [ $BUILD -eq 1 ] then pushd "${IMPALA_HOME}" - "${MAKE_CMD:-make}" ${IMPALA_MAKE_FLAGS} "-j${IMPALA_BUILD_THREADS:-4}" \ + "${MAKE_CMD:-${IMPALA_MAKE_CMD}}" ${IMPALA_MAKE_FLAGS} "-j${IMPALA_BUILD_THREADS:-4}" \ TestUdas TestUdfs test-udfs-ir udfsample udasample udf-sample-ir uda-sample-ir cd "${IMPALA_HOME}/java/test-corrupt-hive-udfs" "${IMPALA_HOME}/bin/mvn-quiet.sh" package