IMPALA-9618: fix some usability issues with dev env
Automatically assume IMPALA_HOME is the source directory in a couple of
places.

Delete the cache_tables.py script and the MINI_DFS_BASE_DATA_DIR config
variable, both of which had bit-rotted and were unused.

Allow setting IMPALA_CLUSTER_NODES_DIR to put the minicluster nodes
(most importantly the data) in a different location, e.g. on a
different filesystem.

Testing:
I set up a dev environment using this code and was able to load data
and run some tests.

Change-Id: Ibd8b42a6d045d73e3ea29015aa6ccbbde278eec7
Reviewed-on: http://gerrit.cloudera.org:8080/15687
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Committed by: Impala Public Jenkins
Parent: 5e69ae1d7d
Commit: 5989900ae8
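The main new knob in this change is IMPALA_CLUSTER_NODES_DIR. The default added
below keeps the old testdata/cluster/cdh$CDH_MAJOR_VERSION layout, but because it
uses the "${VAR-default}" form it only applies when the variable is unset, so a
pre-set value wins. A minimal sketch of how a developer might relocate the
minicluster data follows; the file name bin/impala-config.sh, the path
/mnt/fast-ssd/impala-nodes, and the admin subcommands are assumptions for
illustration, not part of this commit:

    # Sketch only: keep minicluster node data on a separate filesystem.
    # /mnt/fast-ssd/impala-nodes is a placeholder path.
    export IMPALA_CLUSTER_NODES_DIR=/mnt/fast-ssd/impala-nodes

    # Source the dev environment (assumed to be bin/impala-config.sh, the file
    # the first hunks below appear to patch); because the new default only fills
    # in an unset variable, the override above is preserved.
    source ./bin/impala-config.sh

    # Recreate the minicluster so the nodes land under the new directory.
    # Subcommand names are taken from the functions visible in
    # testdata/cluster/admin below; the exact invocation may differ.
    ./testdata/cluster/admin delete_cluster
    ./testdata/cluster/admin create_cluster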
@@ -44,7 +44,8 @@
set -eu -o pipefail

-: ${IMPALA_HOME:=~/Impala}
+: ${IMPALA_HOME:=$(cd "$(dirname $0)"/..; pwd)}
+export IMPALA_HOME

if [[ -t 1 ]] # if on an interactive terminal
then
@@ -366,6 +366,7 @@ export EXTERNAL_LISTEN_HOST="${EXTERNAL_LISTEN_HOST-0.0.0.0}"
export DEFAULT_FS="${DEFAULT_FS-hdfs://${INTERNAL_LISTEN_HOST}:20500}"
export WAREHOUSE_LOCATION_PREFIX="${WAREHOUSE_LOCATION_PREFIX-}"
export LOCAL_FS="file:${WAREHOUSE_LOCATION_PREFIX}"
+export IMPALA_CLUSTER_NODES_DIR="${IMPALA_CLUSTER_NODES_DIR-$IMPALA_HOME/testdata/cluster/cdh$CDH_MAJOR_VERSION}"

ESCAPED_IMPALA_HOME=$(sed "s/[^0-9a-zA-Z]/_/g" <<< "$IMPALA_HOME")
if $USE_CDP_HIVE; then
@@ -612,7 +613,6 @@ HADOOP_CLASSPATH="$LZO_JAR_PATH"
# minicluster.
HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/*"

-export MINI_DFS_BASE_DATA_DIR="$IMPALA_HOME/cdh-${CDH_MAJOR_VERSION}-hdfs-data"
export PATH="$HADOOP_HOME/bin:$PATH"

export SENTRY_HOME="$CDH_COMPONENTS_HOME/sentry-${IMPALA_SENTRY_VERSION}"
@@ -802,7 +802,7 @@ echo "HADOOP_HOME = $HADOOP_HOME"
echo "HADOOP_CONF_DIR = $HADOOP_CONF_DIR"
echo "HADOOP_INCLUDE_DIR = $HADOOP_INCLUDE_DIR"
echo "HADOOP_LIB_DIR = $HADOOP_LIB_DIR"
-echo "MINI_DFS_BASE_DATA_DIR = $MINI_DFS_BASE_DATA_DIR"
+echo "IMPALA_CLUSTER_NODES_DIR= $IMPALA_CLUSTER_NODES_DIR"
echo "HIVE_HOME = $HIVE_HOME"
echo "HIVE_CONF_DIR = $HIVE_CONF_DIR"
echo "HIVE_SRC_DIR = $HIVE_SRC_DIR"
@@ -18,6 +18,10 @@
# under the License.

set -euo pipefail

+: ${IMPALA_HOME:=$(cd "$(dirname $0)"; pwd)}
+export IMPALA_HOME

. $IMPALA_HOME/bin/report_build_error.sh
setup_report_build_error
testdata/bin/cache_tables.py (vendored, 105 lines deleted)
@@ -1,105 +0,0 @@
#!/usr/bin/env impala-python
##############################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
##############################################################################
#
# This script will warm up the buffer cache with the tables required to run the input
# query. This only works on a mini-dfs cluster. This is remarkably difficult to do
# since hdfs which tries to hide the details of the block locations from users.
# The only way to do this is to
# 1. use the java APIs (deprecated, of course) to extract the block ids.
# 2. find the files with those block ids on the file system and read them
#
# First run testdata/bin/generate-block-ids.sh. This will output the block locations
# to testdata/block-ids. This file is good as long as the mini-dfs cluster does not
# get new files. If the block-ids file is not there, this script will run
# generate-block-ids.sh.
#
# Run this script, passing it the query and it will go read every replica of every
# block of every table in the query.
import math
import os
import re
import sys
import subprocess
import tempfile
from optparse import OptionParser

# Options
parser = OptionParser()
parser.add_option("-q", "--query", dest="query", default = "",
                  help="Query to run. If none specified, runs all queries.")

(options, args) = parser.parse_args()

block_ids_file = 'testdata/block-ids'
data_node_root = os.environ['MINI_DFS_BASE_DATA_DIR'] + '/dfs/data'
block_ids = {}

# Parse the block ids file to all the block ids for all the tables
# the format of the file is:
# <table name>: <block_id1> <block_id2> <etc>
def parse_block_ids():
  full_path = os.environ['IMPALA_HOME'] + "/" + block_ids_file;
  if not os.path.isfile(full_path):
    cmd = os.environ['IMPALA_HOME'] + '/testdata/bin/generate-block-ids.sh'
    os.system(cmd)

  if not os.path.isfile(full_path):
    raise Exception("Could not find/generate block id files: " + full_path)

  f = open(full_path);
  for line in f:
    tokens = line.split(':')
    blocks = tokens[1].strip().split(' ')
    block_ids[tokens[0].strip()] = blocks

# Parse for the tables used in this query
def parse_tables(query):
  table_predecessor = ['from', 'join']
  tokens = query.split(' ')
  tables = []
  next_is_table = False
  for t in tokens:
    t = t.lower()
    if next_is_table:
      tables.append(t)
      next_is_table = False
    if t in table_predecessor:
      next_is_table = True
  return tables

# Warm the buffer cache by cat-ing all the blocks to /dev/null
def warm_buffer_cache(table):
  if table not in block_ids:
    raise Exception("Table not found: " + table)

  blocks = block_ids[table]
  for block in blocks:
    cmd = 'find %s -type f -name blk_%s* -exec cat {} > /dev/null \;' % \
        (data_node_root, block)
    os.system(cmd)

tables = parse_tables(options.query)
parse_block_ids()

if len(tables) == 0:
  raise Exception("Could not parse tables in: " + options.query)

for table in tables:
  warm_buffer_cache(table)
testdata/cluster/admin (vendored, 23 lines changed)
@@ -48,7 +48,6 @@ done
|
||||
shift $(($OPTIND-1))
|
||||
|
||||
DIR=$(dirname $0)
|
||||
NODES_DIR="$DIR/cdh$CDH_MAJOR_VERSION"
|
||||
NODE_COUNT=3
|
||||
if [[ "$TARGET_FILESYSTEM" == "hdfs" && "$ERASURE_CODING" = true ]]; then
|
||||
NODE_COUNT=5
|
||||
@@ -191,13 +190,13 @@ function is_kerberized {

function cluster_exists {
  # Just use the first node as an indicator...
-  if [[ ! -e "$NODES_DIR/${NODE_PREFIX}1" ]]; then
+  if [[ ! -e "$IMPALA_CLUSTER_NODES_DIR/${NODE_PREFIX}1" ]]; then
    return 1
  fi
}

function create_cluster {
-  mkdir -p "$NODES_DIR"
+  mkdir -p "$IMPALA_CLUSTER_NODES_DIR"

  # Used to populate config templates later
  GROUP=$(id -gn)
@@ -384,7 +383,7 @@ function exec_init_script {
  local CMD="$1"

  local PIDS=()
-  for SCRIPT in $(find "$NODES_DIR" -path "*/$NODE_PREFIX*/etc/init.d/$SCRIPT_NAME" \
+  for SCRIPT in $(find "$IMPALA_CLUSTER_NODES_DIR" -path "*/$NODE_PREFIX*/etc/init.d/$SCRIPT_NAME" \
      $FIND_EXECUTABLE_FILTER -type f); do
    if "$SCRIPT" status &>/dev/null; then
      RUNNING=true
@@ -419,7 +418,7 @@ function check_cluster_status {

  ROLE_COUNT=0
  NOT_RUNNING=()
-  for NODE_DIR in "$NODES_DIR/$NODE_PREFIX"*; do
+  for NODE_DIR in "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"*; do
    for SERVICE in ${SUPPORTED_SERVICES[@]-}; do
      for SCRIPT in $(find "$NODE_DIR" -path "*/etc/init.d/$SERVICE*" $FIND_EXECUTABLE_FILTER \
          -type f); do
@@ -472,30 +471,30 @@ function restart {

function delete_data {
  # Delete namenode, datanode and KMS data while preserving directory structure.
-  rm -rf "$NODES_DIR/$NODE_PREFIX"*/data/dfs/{nn,dn}/*
-  rm -f "$NODES_DIR/$NODE_PREFIX"*/data/kms.keystore
+  rm -rf "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"*/data/dfs/{nn,dn}/*
+  rm -f "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"*/data/kms.keystore
  delete_kudu_data
}

function delete_kudu_data {
-  rm -rf "$NODES_DIR/$NODE_PREFIX"*/var/lib/kudu/{master,ts}/*
+  rm -rf "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"*/var/lib/kudu/{master,ts}/*
}

function delete_cluster {
  pkill -u $USER -f $KILL_CLUSTER_MARKER || true
-  rm -rf "$NODES_DIR"
+  rm -rf "$IMPALA_CLUSTER_NODES_DIR"
}

function get_node_dir {
  if $IS_OSX; then
-    greadlink -f "$NODES_DIR/$1"
+    greadlink -f "$IMPALA_CLUSTER_NODES_DIR/$1"
  else
-    readlink -f "$NODES_DIR/$1"
+    readlink -f "$IMPALA_CLUSTER_NODES_DIR/$1"
  fi
}

function get_hadoop_client_conf_dir {
-  echo "$NODES_DIR/$NODE_PREFIX"1/etc/hadoop/conf
+  echo "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"1/etc/hadoop/conf
}

COMMAND=$1
@@ -209,9 +209,8 @@ class MiniCluster(Cluster):
      shutil.copy(os.path.join(other_conf_dir, file_name), self._local_hadoop_conf_dir)

  def _get_node_conf_dir(self):
-    return os.path.join(os.environ["IMPALA_HOME"], "testdata", "cluster",
-        "cdh%s" % os.environ["CDH_MAJOR_VERSION"], "node-1",
-        "etc", "hadoop", "conf")
+    return os.path.join(os.environ["IMPALA_CLUSTER_NODES_DIR"],
+        "node-1", "etc", "hadoop", "conf")

  def _get_other_conf_dir(self):
    return os.path.join(os.environ["IMPALA_HOME"], "fe", "src", "test",