Files
impala/testdata/bin/create-load-data.sh
Lenni Kuff bf16b5cd0d IMPALA-749: Fetch partitions in batches, rather than all at once.
This updates how Impala fetches partition metadata from the Hive Metastore to fetch
partitions in batches, rather than all at once. This helps reduce the load on the
HMS and also lets Impala scale to above 32K partitions. The downside is that it
may require additional RPCs to get all the partitions.

This is done by first querying the metastore to get all the partition names that
exist, then splitting the list of names into seperate batches to get the actual
partition metadata.

Impala uses a default size of 1000 partitions per batch, but it can be configured
by setting the 'hive.metastore.batch.retrieve.table.partition.max' parameter
in the hive-site.xml config file.

Change-Id: Ide0ec30ef8a9e00f79c26551aa8e5e7814c73034
Reviewed-on: http://gerrit.ent.cloudera.com:8080/1662
Reviewed-by: Lenni Kuff <lskuff@cloudera.com>
Tested-by: jenkins
Reviewed-on: http://gerrit.ent.cloudera.com:8080/1698
2014-02-28 22:30:45 -08:00

162 lines
6.7 KiB
Bash
Executable File

#!/bin/bash
# Copyright (c) 2012 Cloudera, Inc. All rights reserved.
#
# This script can be executed in two ways:
# 1) Without any command line parameters - A normal data load will happen where data is
# generated as needed, generally by issuing 'INSERT INTO <table> SELECT *' commands.
# 2) With a command line parameter pointing to a test-warehouse snapshot file - In this
# case the snapshot file contents will be copied into HDFS prior to calling the data load
# scripts. This speeds up overall data loading time because it usually means only the
# table metadata needs to be created.
#
# For more information look at testdata/bin/load-test-warehouse-snapshot.sh and
# bin/load-data.py
if [ x${JAVA_HOME} == x ]; then
echo JAVA_HOME not set
exit 1
fi
. ${IMPALA_HOME}/bin/impala-config.sh
set -e
# If the user has specified a command line argument, treat it as the test-warehouse
# snapshot file and pass it to the load-test-warehouse-snapshot.sh script for processing.
if [[ $1 ]]; then
${IMPALA_HOME}/testdata/bin/load-test-warehouse-snapshot.sh "$1"
else
echo "Loading hive builtins"
${IMPALA_HOME}/testdata/bin/load-hive-builtins.sh
echo "Generating HBase data"
${IMPALA_HOME}/testdata/bin/create-hbase.sh
fi
set -u
IMPALAD_LOG_DIR=${IMPALA_TEST_CLUSTER_LOG_DIR}/data_loading
mkdir -p ${IMPALAD_LOG_DIR}
# Load the data set
pushd ${IMPALA_HOME}/bin
./start-impala-cluster.py -s 3 --wait_for_cluster --log_dir=${IMPALAD_LOG_DIR}
# Use unbuffered logging by executing these data loading steps with 'python -u'
python -u ./load-data.py --workloads functional-query --exploration_strategy exhaustive
python -u ./load-data.py --workloads tpcds --exploration_strategy core
python -u ./load-data.py --workloads tpch --exploration_strategy core
# Load all the auxiliary workloads (if any exist)
if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then
python -u ./load-data.py --workloads all --workload_dir=${IMPALA_AUX_WORKLOAD_DIR}\
--dataset_dir=${IMPALA_AUX_DATASET_DIR} --exploration_strategy core
else
echo "Skipping load of auxilary workloads because directories do not exist"
fi
popd
# Create a table w/ 1234 partitions that is empty. Used to validate fetching partitions
# in batches.
${IMPALA_HOME}/testdata/bin/create-table-many-blocks.sh -p 1234 -b 0
# Split HBase table
echo "Splitting HBase table"
${IMPALA_HOME}/testdata/bin/split-hbase.sh
echo COPYING AUTHORIZATION POLICY FILE
hadoop fs -rm -f /test-warehouse/authz-policy.ini
hadoop fs -put ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini /test-warehouse/
# TODO: The multi-format table will move these files. So we need to copy them to a
# temporary location for that table to use. Should find a better way to handle this.
echo COPYING DATA FOR DEPENDENT TABLES
hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat
hadoop fs -rm -r -f /tmp/alltypes_rc
hadoop fs -rm -r -f /tmp/alltypes_seq
hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009
hadoop fs -mkdir -p /tmp/alltypes_rc/year=2009
hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009
hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009
# Create a hidden file in AllTypesSmall
hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/_hidden
hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/.hidden
hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
/test-warehouse/alltypessmall/year=2009/month=1/_hidden
hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
/test-warehouse/alltypessmall/year=2009/month=1/.hidden
# Configure alltypes_seq as a read-only table
hadoop fs -chmod -R 444 /test-warehouse/alltypes_seq/year=2009/month=1
hadoop fs -chmod -R 444 /test-warehouse/alltypes_seq/year=2009/month=3
# TODO: For some reason DROP TABLE IF EXISTS sometimes fails on HBase if the table does
# not exist. To work around this, disable exit on error before executing this command.
# Need to investigate this more, but this works around the problem to unblock automation.
set +o errexit
${HIVE_HOME}/bin/hive -hiveconf hive.root.logger=WARN,console -v \
-e "DROP TABLE IF EXISTS functional_hbase.internal_hbase_table"
echo "disable 'functional_hbase.internal_hbase_table'" | hbase shell
echo "drop 'functional_hbase.internal_hbase_table'" | hbase shell
set -e
# For tables that rely on loading data from local fs test-warehouse
# TODO: Find a good way to integrate this with the normal data loading scripts
${HIVE_HOME}/bin/hive -hiveconf hive.root.logger=WARN,console -v \
-f ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
if [ $? != 0 ]; then
echo DEPENDENT LOAD FAILED
exit 1
fi
# Load the index files for corrupted lzo data.
hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index
hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \
/test-warehouse/bad_text_lzo_text_lzo/
hadoop fs -rm -r -f /bad_text_lzo_text_lzo/
hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ /
# Cleanup the old bad_text_lzo files, if they exist.
hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/
# Index all lzo files in HDFS under /test-warehouse
${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse
hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/
# IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \
/test-warehouse/bad_parquet_parquet
# Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary)
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \
/test-warehouse/bad_parquet_parquet
# IMPALA-720: data file produced by parquet-mr with multiple row groups
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \
/test-warehouse/bad_parquet_parquet
# Remove an index file so we test an un-indexed LZO file
hadoop fs -rm /test-warehouse/alltypes_text_lzo/year=2009/month=1/000013_0.lzo.index
# Add a sequence file that only contains a header (see IMPALA-362)
hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \
/test-warehouse/tinytable_seq_snap
# Create special table for testing Avro schema resolution
# (see testdata/avro_schema_resolution/README)
pushd ${IMPALA_HOME}/testdata/avro_schema_resolution
hive -f create_table.sql
popd
${IMPALA_HOME}/testdata/bin/compute-table-stats.sh
# Build the test Hive UDFs
pushd ${IMPALA_HOME}/tests/test-hive-udfs
mvn clean package
popd
# Copy the test UDF/UDA libraries into HDFS
${IMPALA_HOME}/testdata/bin/copy-udfs-udas.sh
# Setup for HDFS caching
${IMPALA_HOME}/testdata/bin/setup-hdfs-caching.sh
${IMPALA_HOME}/bin/start-impala-cluster.py --kill_only