mirror of
https://github.com/apache/impala.git
synced 2026-01-06 15:01:43 -05:00
This is a revert of a revert, re-enabling parallel data load. It avoids the race condition by explicitly configuring the temporary directory in question in load-data.py. When the parallel data load change went in, we discovered a race with a signature of: java.io.FileNotFoundException: File /tmp/hadoop-jenkins/mapred/local/1508958341829_tmp does not exist. The number in this path is milliseconds since the epoch, and the race occurs when two queries submitted to HiveServer2, running with the local runner, hit the same millisecond timestamp. The upstream bug is https://issues.apache.org/jira/browse/MAPREDUCE-6441, and I described the symptoms in https://issues.apache.org/jira/browse/MAPREDUCE-6992 (which is now marked as a dupe). I've tested this by running data load 5 times on the same machines where it failed before. I also ran data load manually and inspected the system to make sure that the temporary directories are getting created as expected in /tmp/impala-data-load-*. Change-Id: I60d65794da08de4bb3eb439a2414c095f5be0c10 Reviewed-on: http://gerrit.cloudera.org:8080/8405 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Impala Public Jenkins
87 lines
2.7 KiB
Bash
Executable File
87 lines
2.7 KiB
Bash
Executable File
#!/bin/bash
|
|
#
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
# run-step helper function used by multiple scripts. To use in a bash script, just
|
|
# source this file.
|
|
|
|
# Runs a build step, sending all of its output to a log file and printing the
# tail of that log only if the step fails.
# Usage: run-step <step description> <log file name> <cmd> <arg1> <arg2> ...
# Globals:
#   LOG_DIR (read) - must be set to a writable directory for logs.
# Outputs:
#   Progress and timing messages on stdout; on failure, also the last 50 lines
#   of the log.
# Returns:
#   0 if the command succeeded; 1 if it failed or LOG_DIR is not a directory.
function run-step {
  local MSG=$1
  shift
  local LOG_FILE_NAME=$1
  shift

  # ${LOG_DIR:-} keeps this check working (instead of aborting) under 'set -u'.
  if [ ! -d "${LOG_DIR:-}" ]; then
    echo "LOG_DIR must be set to a valid directory: ${LOG_DIR:-}"
    return 1
  fi
  # Quoted everywhere below so that a LOG_DIR or log file name containing
  # whitespace does not trigger an "ambiguous redirect".
  local LOG="${LOG_DIR}/${LOG_FILE_NAME}"

  echo "${MSG} (logging to ${LOG})... "
  # "$@" expands to several words here; echo joins them with single spaces.
  echo "Log for command '$@'" > "${LOG}"
  # Timing variables are local so they do not overwrite the caller's variables.
  local START_TIME=$SECONDS
  local ELAPSED_TIME
  if ! "$@" >> "${LOG}" 2>&1 ; then
    ELAPSED_TIME=$((SECONDS - START_TIME))
    echo " FAILED (Took: $((ELAPSED_TIME / 60)) min $((ELAPSED_TIME % 60)) sec)"
    echo " '$@' failed. Tail of log:"
    tail -n50 "${LOG}"
    return 1
  fi
  ELAPSED_TIME=$((SECONDS - START_TIME))
  echo " ${MSG} OK (Took: $((ELAPSED_TIME / 60)) min $((ELAPSED_TIME % 60)) sec)"
}
|
|
|
|
# Parallel arrays used to manage background tasks: run-step-backgroundable
# appends the pid of each task to RUN_STEP_PIDS and its description to
# RUN_STEP_MSGS at the same index; run-step-wait-all empties both again.
declare -a RUN_STEP_PIDS
|
|
declare -a RUN_STEP_MSGS
|
|
|
|
# Runs the given step in the background via run-step and records it so that it
# can be joined later. Many tasks may be started in the background, and all of
# them must be joined together with run-step-wait-all.
# No dependency management or maximums on number of tasks are provided.
# Usage: run-step-backgroundable <step description> <log file name> <cmd> <arg1> ...
# Globals:
#   RUN_STEP_PIDS (appended) - pid of the background job.
#   RUN_STEP_MSGS (appended) - description of the background job.
# Outputs:
#   A "Started ..." line on stdout (in addition to whatever run-step prints).
function run-step-backgroundable {
  # Local so that a MSG variable in the sourcing script is not overwritten.
  local MSG="$1"
  run-step "$@" &
  local pid=$!
  echo "Started ${MSG} in background; pid $pid."
  RUN_STEP_PIDS+=("$pid")
  RUN_STEP_MSGS+=("${MSG}")
}
|
|
|
|
# Waits for all tasks that were started with run-step-backgroundable.
# Globals:
#   RUN_STEP_PIDS (read, then cleared) - pids to wait for.
#   RUN_STEP_MSGS (read, then cleared) - descriptions, index-aligned with pids.
# Outputs:
#   One "Background task ... failed." line on stdout per failed task.
# Returns:
#   0 if every background task succeeded, 1 if any of them failed.
function run-step-wait-all {
  local ret=0
  # Loop variables are local so they do not overwrite same-named variables
  # (pid and msg are common names) in the sourcing script.
  local idx pid msg
  for idx in "${!RUN_STEP_PIDS[@]}"; do
    pid="${RUN_STEP_PIDS[$idx]}"
    msg="${RUN_STEP_MSGS[$idx]}"

    # Keep waiting for the remaining tasks even after one has failed.
    if ! wait "$pid"; then
      ret=1
      echo "Background task $msg (pid $pid) failed."
    fi
  done
  RUN_STEP_PIDS=()
  RUN_STEP_MSGS=()
  return $ret
}
|