This is the first iteration of a kerberized development environment. All the
daemons start and use Kerberos, with the sole exception of the Hive Metastore;
this is sufficient to test Impala authentication. When buildall.sh is run with
'-kerberize', it stops before loading data or attempting to run tests. Loading
data into the cluster is known not to work at this time: Beeline -> HiveServer2
-> MapReduce throws errors, and Beeline -> HiveServer2 -> HBase has problems.
These are left for later work. However, the Impala daemons happily authenticate
via Kerberos, both from clients (like the Impala shell) and amongst each other,
so if you can get data into the mini-cluster, you can query it.

Usage:
* Supply a '-kerberize' option to buildall.sh, or
* Supply a '-kerberize' option to create-test-configuration.sh, then run
  'run-all.sh -format', re-source impala-config.sh, and start the Impala
  daemons as usual. You must reformat the cluster because kerberizing it
  changes the ownership of all files in HDFS.

Notable changes:
* Added clean start/stop script for the llama-minikdc
* Creation of kerberized HDFS - namenode and datanodes
* Kerberized HBase (and ZooKeeper)
* Kerberized Hive (minus the Metastore)
* Kerberized Impala
* Loading of data very nearly working

Still to go:
* Kerberize the Metastore
* Get data loading working
* Run all tests
* The unknown unknowns
* Extensive testing

Change-Id: Iee3f56f6cc28303821fc6a3bf3ca7f5933632160
Reviewed-on: http://gerrit.sjc.cloudera.com:8080/4019
Reviewed-by: Michael Yoder <myoder@cloudera.com>
Tested-by: jenkins
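As a concrete sketch of the two usage paths above (script locations are assumed
from the standard Impala checkout layout):

    # Option 1: kerberize as part of a full build
    ./buildall.sh -kerberize

    # Option 2: kerberize an existing environment, then reformat and restart
    ./bin/create-test-configuration.sh -kerberize
    ./testdata/bin/run-all.sh -format   # reformat: kerberizing changes HDFS file ownership
    . ./bin/impala-config.sh            # re-source to pick up the kerberized settings
    # ...then start the Impala daemons as usual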
216 lines · 8.9 KiB · Bash · Executable File
#!/bin/bash
# Copyright 2012 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script can be executed in two ways:
# 1) Without any command line parameters - A normal data load will happen where data is
#    generated as needed, generally by issuing 'INSERT INTO <table> SELECT *' commands.
# 2) With a command line parameter pointing to a test-warehouse snapshot file - In this
#    case the snapshot file contents will be copied into HDFS prior to calling the data
#    load scripts. This speeds up overall data loading time because it usually means
#    only the table metadata needs to be created.
#
# For more information look at testdata/bin/load-test-warehouse-snapshot.sh and
# bin/load-data.py
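# Example invocations (a sketch; the script name and snapshot file name below are
# illustrative, not checked-in artifacts):
#   ./create-load-data.sh
#   ./create-load-data.sh /tmp/test-warehouse-SNAPSHOT.tar.gz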
. ${IMPALA_HOME}/bin/impala-config.sh
set -ex

# Setup for HDFS caching
${IMPALA_HOME}/testdata/bin/setup-hdfs-caching.sh

# If the user has specified a command line argument, treat it as the test-warehouse
# snapshot file and pass it to the load-test-warehouse-snapshot.sh script for processing.
if [[ -n "$1" ]]; then
  ${IMPALA_HOME}/testdata/bin/load-test-warehouse-snapshot.sh "$1"
else
  echo "Loading hive builtins"
  ${IMPALA_HOME}/testdata/bin/load-hive-builtins.sh

  echo "Generating HBase data"
  ${IMPALA_HOME}/testdata/bin/create-hbase.sh
fi
set -u

IMPALAD_LOG_DIR=${IMPALA_TEST_CLUSTER_LOG_DIR}/data_loading
mkdir -p ${IMPALAD_LOG_DIR}

# Copy the test data source library into HDFS
${IMPALA_HOME}/testdata/bin/copy-data-sources.sh
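# Note: check-schema-diff.sh reports a detected schema change through a non-zero
# exit status; with 'set -e' active that status would abort the script, so errexit
# is suspended around the call below.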
# If a schema change is detected, force load the data.
set +e
LOAD_DATA_ARGS=""
${IMPALA_HOME}/testdata/bin/check-schema-diff.sh
if [[ $? -eq 1 ]]; then
  LOAD_DATA_ARGS="--force"
fi
set -e
# Load schemas
hadoop fs -rm -r -f /test-warehouse/schemas
hadoop fs -mkdir /test-warehouse/schemas
for SCHEMA_FILE in zipcode_incomes.parquet unsupported.parquet map.parquet \
    array.parquet struct.parquet alltypestiny.parquet \
    malformed_decimal_tiny.parquet decimal.parquet; do
  hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/${SCHEMA_FILE} \
    /test-warehouse/schemas/
done
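# ${MINIKDC_PRINC_HIVE} is assumed to be exported by impala-config.sh when the
# minicluster was created with -kerberize (see the llama-minikdc scripts).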
# For kerberized clusters, pass Kerberos options to the data-loading scripts.
if ${CLUSTER_DIR}/admin is_kerberized; then
  LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}"
fi
# Load the data sets
pushd ${IMPALA_HOME}/bin
./start-impala-cluster.py -s 3 --wait_for_cluster --log_dir=${IMPALAD_LOG_DIR}
# Use unbuffered logging by executing these data loading steps with 'python -u'
python -u ./load-data.py --workloads functional-query --exploration_strategy exhaustive \
  ${LOAD_DATA_ARGS}
python -u ./load-data.py --workloads tpcds --exploration_strategy core ${LOAD_DATA_ARGS}
python -u ./load-data.py --workloads tpch --exploration_strategy core ${LOAD_DATA_ARGS}
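# The 'testPool' cache pool referenced below is assumed to have been created by
# setup-hdfs-caching.sh at the top of this script.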
# Cache test tables
./impala-shell.sh -q "alter table tpch.nation set cached in 'testPool'"
./impala-shell.sh -q "alter table functional.alltypestiny set cached in 'testPool'"
# Load the test data source and table
./impala-shell.sh -f ${IMPALA_HOME}/testdata/bin/create-data-source-table.sql
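# IMPALA_AUX_WORKLOAD_DIR and IMPALA_AUX_DATASET_DIR come from the environment.
# Illustrative settings (the auxiliary repo path is hypothetical):
#   export IMPALA_AUX_WORKLOAD_DIR=${IMPALA_HOME}/../Impala-auxiliary-tests/workloads
#   export IMPALA_AUX_DATASET_DIR=${IMPALA_HOME}/../Impala-auxiliary-tests/testdata/datasets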
# Load all the auxiliary workloads (if any exist)
if [ -d "${IMPALA_AUX_WORKLOAD_DIR}" ] && [ -d "${IMPALA_AUX_DATASET_DIR}" ]; then
  python -u ./load-data.py --workloads all --workload_dir=${IMPALA_AUX_WORKLOAD_DIR} \
    --dataset_dir=${IMPALA_AUX_DATASET_DIR} --exploration_strategy core \
    ${LOAD_DATA_ARGS}
else
  echo "Skipping load of auxiliary workloads because directories do not exist"
fi
popd
# Create a table with 1234 partitions. Used to validate fetching/updating partitions
# in batches.
${IMPALA_HOME}/testdata/bin/create-table-many-blocks.sh -p 1234 -b 1

# Split the HBase table
echo "Splitting HBase table"
${IMPALA_HOME}/testdata/bin/split-hbase.sh
echo "COPYING AUTHORIZATION POLICY FILE"
hadoop fs -rm -f /test-warehouse/authz-policy.ini
hadoop fs -put ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini /test-warehouse/
# TODO: The multi-format table will move these files, so we need to copy them to a
# temporary location for that table to use. Should find a better way to handle this.
echo "COPYING DATA FOR DEPENDENT TABLES"
hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat
hadoop fs -rm -r -f /tmp/alltypes_rc
hadoop fs -rm -r -f /tmp/alltypes_seq
hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009
hadoop fs -mkdir -p /tmp/alltypes_rc/year=2009
hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009
hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009
# Create hidden files in AllTypesSmall (names beginning with '.' or '_' are treated
# as hidden and must be skipped by table scans).
hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/_hidden
hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/.hidden
hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
  /test-warehouse/alltypessmall/year=2009/month=1/_hidden
hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
  /test-warehouse/alltypessmall/year=2009/month=1/.hidden
# Make two partitions of alltypes_seq read-only
hadoop fs -chmod -R 444 /test-warehouse/alltypes_seq/year=2009/month=1
hadoop fs -chmod -R 444 /test-warehouse/alltypes_seq/year=2009/month=3
# TODO: For some reason DROP TABLE IF EXISTS sometimes fails on HBase if the table does
# not exist. To work around this, disable exit on error before executing this command.
# Need to investigate this more, but this works around the problem to unblock automation.
set +o errexit
${HIVE_HOME}/bin/hive -hiveconf hive.root.logger=WARN,console -v \
  -e "DROP TABLE IF EXISTS functional_hbase.internal_hbase_table"
echo "disable 'functional_hbase.internal_hbase_table'" | hbase shell
echo "drop 'functional_hbase.internal_hbase_table'" | hbase shell
set -e
# For tables that rely on loading data from the local filesystem into test-warehouse.
# TODO: Find a good way to integrate this with the normal data loading scripts.
# With 'set -e' active, a "$?" check after the command would never execute on
# failure, so test the command's exit status directly.
if ! ${HIVE_HOME}/bin/hive -hiveconf hive.root.logger=WARN,console -v \
    -f ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql; then
  echo "DEPENDENT LOAD FAILED"
  exit 1
fi
# Load the index files for corrupted lzo data.
hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index
hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \
  /test-warehouse/bad_text_lzo_text_lzo/

# Move the corrupted-lzo table out of /test-warehouse so the indexer run below
# does not attempt to re-index it.
hadoop fs -rm -r -f /bad_text_lzo_text_lzo/
hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ /
# Cleanup the old bad_text_lzo files, if they exist.
hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/

# Index all lzo files in HDFS under /test-warehouse
${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse

# Move the corrupted-lzo table back into /test-warehouse.
hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/
# IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \
  /test-warehouse/bad_parquet_parquet

# Data file produced by parquet-mr with repeated values (produces a 0-bit-width
# dictionary)
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \
  /test-warehouse/bad_parquet_parquet

# IMPALA-720: data file produced by parquet-mr with multiple row groups
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \
  /test-warehouse/bad_parquet_parquet
# Remove an index file so we test an un-indexed LZO file
hadoop fs -rm /test-warehouse/alltypes_text_lzo/year=2009/month=1/000000_0.lzo.index

# Add a sequence file that only contains a header (see IMPALA-362)
hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \
  /test-warehouse/tinytable_seq_snap
# Create special table for testing Avro schema resolution
# (see testdata/avro_schema_resolution/README)
pushd ${IMPALA_HOME}/testdata/avro_schema_resolution
hive -f create_table.sql
popd

${IMPALA_HOME}/testdata/bin/compute-table-stats.sh

# Build the test Hive UDFs
pushd ${IMPALA_HOME}/tests/test-hive-udfs
mvn clean package
popd

# Copy the test UDF/UDA libraries into HDFS
${IMPALA_HOME}/testdata/bin/copy-udfs-udas.sh

# Shut down the Impala cluster that was started for data loading.
${IMPALA_HOME}/bin/start-impala-cluster.py --kill_only