mirror of
https://github.com/apache/impala.git
synced 2025-12-30 21:02:41 -05:00
Syntax is "CREATE TABLE name LIKE fileformat '/path/to/file'". Supports all options that
CREATE TABLE does. Currently only PARQUET is supported. Run
testdata/bin/create-load-data.sh after pulling this patch.

Change-Id: Ibb9fbb89dbde6acceb850b914c48d12f22b33f55
Reviewed-on: http://gerrit.ent.cloudera.com:8080/2720
Reviewed-by: Victor Bittorf <victor.bittorf@cloudera.com>
Tested-by: jenkins
Reviewed-on: http://gerrit.ent.cloudera.com:8080/3158
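For illustration, a minimal use of the new syntax, issued the same way this script issues
other statements (the table name here is hypothetical; the Parquet file is one this
script stages into /test-warehouse/schemas/):

    ./impala-shell.sh -q \
      "CREATE TABLE example_like_parquet LIKE PARQUET '/test-warehouse/schemas/zipcode_incomes.parquet'"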
210 lines
8.7 KiB
Bash
Executable File
#!/bin/bash
# Copyright 2012 Cloudera Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script can be executed in two ways:
# 1) Without any command line parameters - A normal data load will happen where data is
#    generated as needed, generally by issuing 'INSERT INTO <table> SELECT *' commands.
# 2) With a command line parameter pointing to a test-warehouse snapshot file - In this
#    case the snapshot file contents will be copied into HDFS prior to calling the data
#    load scripts. This speeds up overall data loading time because it usually means only
#    the table metadata needs to be created.
#
# For more information look at testdata/bin/load-test-warehouse-snapshot.sh and
# bin/load-data.py
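#
# Example invocations (the snapshot path below is only illustrative):
#   ./testdata/bin/create-load-data.sh                          # full generate-and-load
#   ./testdata/bin/create-load-data.sh /path/to/test-warehouse-SNAPSHOT.tar.gz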

. ${IMPALA_HOME}/bin/impala-config.sh
set -ex

# Setup for HDFS caching
${IMPALA_HOME}/testdata/bin/setup-hdfs-caching.sh

# If the user has specified a command line argument, treat it as the test-warehouse
# snapshot file and pass it to the load-test-warehouse-snapshot.sh script for processing.
if [[ $1 ]]; then
  ${IMPALA_HOME}/testdata/bin/load-test-warehouse-snapshot.sh "$1"
else
  echo "Loading hive builtins"
  ${IMPALA_HOME}/testdata/bin/load-hive-builtins.sh

  echo "Generating HBase data"
  ${IMPALA_HOME}/testdata/bin/create-hbase.sh
fi
set -u

IMPALAD_LOG_DIR=${IMPALA_TEST_CLUSTER_LOG_DIR}/data_loading
mkdir -p ${IMPALAD_LOG_DIR}

# Copy the test data source library into HDFS
${IMPALA_HOME}/testdata/bin/copy-data-sources.sh

# If a schema change is detected, force load the data.
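# check-schema-diff.sh is assumed to exit 1 when the schemas have changed since the last
# load, in which case --force is passed so load-data.py reloads tables that already exist.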
set +e
LOAD_DATA_ARGS=""
${IMPALA_HOME}/testdata/bin/check-schema-diff.sh
if [[ $? -eq 1 ]]; then
  LOAD_DATA_ARGS="--force"
fi
set -e

# Load schemas
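# These Parquet files appear to back the new CREATE TABLE ... LIKE PARQUET '<hdfs path>'
# tests (per the commit message, only PARQUET is currently supported for this syntax).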
hadoop fs -rm -r -f /test-warehouse/schemas
hadoop fs -mkdir /test-warehouse/schemas
hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/zipcode_incomes.parquet \
  /test-warehouse/schemas/
hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/unsupported.parquet \
  /test-warehouse/schemas/
hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/map.parquet \
  /test-warehouse/schemas/
hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/array.parquet \
  /test-warehouse/schemas/
hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/struct.parquet \
  /test-warehouse/schemas/
hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/alltypestiny.parquet \
  /test-warehouse/schemas/
hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/malformed_decimal_tiny.parquet \
  /test-warehouse/schemas/
hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/decimal.parquet \
  /test-warehouse/schemas/

# Load the data set
pushd ${IMPALA_HOME}/bin
./start-impala-cluster.py -s 3 --wait_for_cluster --log_dir=${IMPALAD_LOG_DIR}
# Use unbuffered logging by executing these data loading steps with 'python -u'
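# The exploration strategy is assumed to control how many table format combinations are
# loaded per workload: 'exhaustive' covers all of them, 'core' only a representative subset.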
python -u ./load-data.py --workloads functional-query --exploration_strategy exhaustive \
  ${LOAD_DATA_ARGS}
python -u ./load-data.py --workloads tpcds --exploration_strategy core ${LOAD_DATA_ARGS}
python -u ./load-data.py --workloads tpch --exploration_strategy core ${LOAD_DATA_ARGS}

# Cache test tables
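# 'testPool' is presumably the HDFS cache pool set up by setup-hdfs-caching.sh above.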
./impala-shell.sh -q "alter table tpch.nation set cached in 'testPool'"
|
|
./impala-shell.sh -q "alter table functional.alltypestiny set cached in 'testPool'"
|
|
|
|
# Load the test data source and table
|
|
./impala-shell.sh -f ${IMPALA_HOME}/testdata/bin/create-data-source-table.sql
|
|
# Load all the auxiliary workloads (if any exist)
|
|
if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then
|
|
python -u ./load-data.py --workloads all --workload_dir=${IMPALA_AUX_WORKLOAD_DIR}\
|
|
--dataset_dir=${IMPALA_AUX_DATASET_DIR} --exploration_strategy core
|
|
else
|
|
echo "Skipping load of auxilary workloads because directories do not exist"
|
|
fi
|
|
popd
|
|
|
|
# Create a table w/ 1234 partitions. Used to validate fetching/updating partitions in
|
|
# batches.
|
|
${IMPALA_HOME}/testdata/bin/create-table-many-blocks.sh -p 1234 -b 1
|
|
|
|
# Split HBase table
|
|
echo "Splitting HBase table"
|
|
${IMPALA_HOME}/testdata/bin/split-hbase.sh
|
|
|
|
echo COPYING AUTHORIZATION POLICY FILE
|
|
hadoop fs -rm -f /test-warehouse/authz-policy.ini
|
|
hadoop fs -put ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini /test-warehouse/
|
|
|
|
# TODO: The multi-format table will move these files. So we need to copy them to a
|
|
# temporary location for that table to use. Should find a better way to handle this.
|
|
echo COPYING DATA FOR DEPENDENT TABLES
|
|
hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat
|
|
hadoop fs -rm -r -f /tmp/alltypes_rc
|
|
hadoop fs -rm -r -f /tmp/alltypes_seq
|
|
hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009
|
|
hadoop fs -mkdir -p /tmp/alltypes_rc/year=2009
|
|
hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009
|
|
hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009
|
|
|
|
# Create a hidden file in AllTypesSmall
|
|
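# Filenames starting with '.' or '_' are treated as hidden; these copies presumably let
# tests verify that table scans skip such files.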
hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/_hidden
hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/.hidden
hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
  /test-warehouse/alltypessmall/year=2009/month=1/_hidden
hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \
  /test-warehouse/alltypessmall/year=2009/month=1/.hidden

# Configure alltypes_seq as a read-only table
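# Mode 444 removes write permission, presumably so tests can exercise operations against
# partitions that Impala cannot write to.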
hadoop fs -chmod -R 444 /test-warehouse/alltypes_seq/year=2009/month=1
hadoop fs -chmod -R 444 /test-warehouse/alltypes_seq/year=2009/month=3

# TODO: For some reason DROP TABLE IF EXISTS sometimes fails on HBase if the table does
# not exist. To work around this, disable exit on error before executing this command.
# Need to investigate this more, but this works around the problem to unblock automation.
set +o errexit
${HIVE_HOME}/bin/hive -hiveconf hive.root.logger=WARN,console -v \
  -e "DROP TABLE IF EXISTS functional_hbase.internal_hbase_table"
echo "disable 'functional_hbase.internal_hbase_table'" | hbase shell
echo "drop 'functional_hbase.internal_hbase_table'" | hbase shell
set -e

# For tables that rely on loading data from local fs test-warehouse
# TODO: Find a good way to integrate this with the normal data loading scripts
${HIVE_HOME}/bin/hive -hiveconf hive.root.logger=WARN,console -v \
  -f ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql
if [ $? != 0 ]; then
  echo DEPENDENT LOAD FAILED
  exit 1
fi

# Load the index files for corrupted lzo data.
hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index
hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \
  /test-warehouse/bad_text_lzo_text_lzo/

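# Temporarily move the corrupt-LZO table out of /test-warehouse, presumably so the
# indexing pass below does not process the corrupted file or its hand-placed index.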
hadoop fs -rm -r -f /bad_text_lzo_text_lzo/
hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ /
# Cleanup the old bad_text_lzo files, if they exist.
hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/

# Index all lzo files in HDFS under /test-warehouse
${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse

hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/

# IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \
  /test-warehouse/bad_parquet_parquet

# Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary)
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \
  /test-warehouse/bad_parquet_parquet

# IMPALA-720: data file produced by parquet-mr with multiple row groups
hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \
  /test-warehouse/bad_parquet_parquet

# Remove an index file so we test an un-indexed LZO file
hadoop fs -rm /test-warehouse/alltypes_text_lzo/year=2009/month=1/000013_0.lzo.index

# Add a sequence file that only contains a header (see IMPALA-362)
hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \
  /test-warehouse/tinytable_seq_snap

# Create special table for testing Avro schema resolution
# (see testdata/avro_schema_resolution/README)
pushd ${IMPALA_HOME}/testdata/avro_schema_resolution
hive -f create_table.sql
popd

${IMPALA_HOME}/testdata/bin/compute-table-stats.sh

# Build the test Hive UDFs
pushd ${IMPALA_HOME}/tests/test-hive-udfs
mvn clean package
popd

# Copy the test UDF/UDA libraries into HDFS
${IMPALA_HOME}/testdata/bin/copy-udfs-udas.sh

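# Shut down the Impala cluster that was started above for data loading.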
${IMPALA_HOME}/bin/start-impala-cluster.py --kill_only