diff --git a/bin/build_thirdparty.sh b/bin/build_thirdparty.sh index 77fa2834f..ea26d96bd 100755 --- a/bin/build_thirdparty.sh +++ b/bin/build_thirdparty.sh @@ -162,7 +162,7 @@ if [ $BUILD_ALL -eq 1 ] || [ $BUILD_GFLAGS -eq 1 ]; then build_preamble $IMPALA_HOME/thirdparty/gflags-${IMPALA_GFLAGS_VERSION} GFlags GFLAGS_INSTALL=`pwd`/third-party-install ./configure --with-pic --prefix=${GFLAGS_INSTALL} - make -j4 install + make -j${IMPALA_BUILD_THREADS:-4} install fi # Build pprof @@ -173,7 +173,7 @@ if [ $BUILD_ALL -eq 1 ] || [ $BUILD_PPROF -eq 1 ]; then # we're not compiling the rest of our code to not omit frame pointers but it # still seems to generate useful profiling data. ./configure --enable-frame-pointers --with-pic - make -j4 + make -j${IMPALA_BUILD_THREADS:-4} fi # Build glog @@ -189,14 +189,14 @@ logging_unittest-logging_unittest.o : CXXFLAGS= -gstabs -O2 EOF cat Makefile >> Makefile.gcc45sles_workaround mv Makefile.gcc45sles_workaround Makefile - make -j4 + make -j${IMPALA_BUILD_THREADS:-4} fi # Build gtest if [ $BUILD_ALL -eq 1 ] || [ $BUILD_GTEST -eq 1 ]; then build_preamble $IMPALA_HOME/thirdparty/gtest-${IMPALA_GTEST_VERSION} GTest cmake . - make -j4 + make -j${IMPALA_BUILD_THREADS:-4} fi # Build Snappy @@ -217,15 +217,15 @@ fi # Build re2 if [ $BUILD_ALL -eq 1 ] || [ $BUILD_RE2 -eq 1 ]; then build_preamble $IMPALA_HOME/thirdparty/re2 RE2 - make -j4 + make -j${IMPALA_BUILD_THREADS:-4} fi # Build Ldap if [ $BUILD_ALL -eq 1 ] || [ $BUILD_LDAP -eq 1 ]; then build_preamble $IMPALA_HOME/thirdparty/openldap-${IMPALA_OPENLDAP_VERSION} Openldap ./configure --enable-slapd=no --prefix=`pwd`/impala_install --enable-static --with-pic - make -j4 - make -j4 depend + make -j${IMPALA_BUILD_THREADS:-4} + make -j${IMPALA_BUILD_THREADS:-4} depend make install fi @@ -233,5 +233,5 @@ fi if [ $BUILD_ALL -eq 1 ] || [ $BUILD_AVRO -eq 1 ]; then build_preamble $IMPALA_HOME/thirdparty/avro-c-${IMPALA_AVRO_VERSION} Avro cmake . 
- make -j4 + make -j${IMPALA_BUILD_THREADS:-4} fi diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh index 2368b88bd..bc82f242f 100755 --- a/bin/run-all-tests.sh +++ b/bin/run-all-tests.sh @@ -22,12 +22,12 @@ set -e . $IMPALA_HOME/bin/set-pythonpath.sh -# Allow picking up strateg from environment +# Allow picking up strategy from environment : ${EXPLORATION_STRATEGY:=core} NUM_ITERATIONS=1 KERB_ARGS="" -. ${IMPALA_HOME}/bin/impala-config.sh +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 if ${CLUSTER_DIR}/admin is_kerberized; then KERB_ARGS="--use_kerberos" fi @@ -75,7 +75,7 @@ echo "Split and assign HBase regions" # To properly test HBase integeration, HBase regions are split and assigned by this # script. Restarting HBase will change the region server assignment. Run split-hbase.sh # before running any test. -${IMPALA_HOME}/testdata/bin/split-hbase.sh +${IMPALA_HOME}/testdata/bin/split-hbase.sh > /dev/null 2>&1 for i in $(seq 1 $NUM_ITERATIONS) do diff --git a/buildall.sh b/buildall.sh index 7a4db0df1..0e898b23a 100755 --- a/buildall.sh +++ b/buildall.sh @@ -27,7 +27,7 @@ if [ ! -z "${MINIKDC_REALM}" ]; then fi export IMPALA_HOME=$ROOT -. "$ROOT"/bin/impala-config.sh +. "$ROOT"/bin/impala-config.sh > /dev/null 2>&1 CLEAN_ACTION=1 TESTDATA_ACTION=0 @@ -205,7 +205,7 @@ fi # Stop any running Impala services. ${IMPALA_HOME}/bin/start-impala-cluster.py --kill --force -if [ $CLEAN_ACTION -eq 1 ] || [ $FORMAT_METASTORE -eq 1 ] || [ $FORMAT_CLUSTER -eq 1 ] +if [[ $CLEAN_ACTION -eq 1 || $FORMAT_METASTORE -eq 1 || $FORMAT_CLUSTER -eq 1 ]] then # Kill any processes that may be accessing postgres metastore. To be safe, this is done # before we make any changes to the config files. @@ -338,23 +338,25 @@ if [ ${TESTS_ACTION} -eq 1 -a \ exit 1 fi -if [ $TESTDATA_ACTION -eq 1 ] -then - # create and load test data +if [ $TESTDATA_ACTION -eq 1 ]; then + # Create testdata. 
$IMPALA_HOME/bin/create_testdata.sh - cd $ROOT - if [ "$SNAPSHOT_FILE" != "" ] - then - yes | ${IMPALA_HOME}/testdata/bin/create-load-data.sh $SNAPSHOT_FILE - else - ${IMPALA_HOME}/testdata/bin/create-load-data.sh + # We have three conditions. + # - A testdata and metastore snapshot exists. + # - Only the testdata snapshot exists. + # - Neither of them exists. + CREATE_LOAD_DATA_ARGS="" + if [ $SNAPSHOT_FILE ] && [ $METASTORE_SNAPSHOT_FILE ]; then + CREATE_LOAD_DATA_ARGS="-snapshot_file ${SNAPSHOT_FILE} -skip_metadata_load" + elif [ $SNAPSHOT_FILE ] && [ -n $METASTORE_SNAPSHOT_FILE ]; then + CREATE_LOAD_DATA_ARGS="-snapshot_file ${SNAPSHOT_FILE}" fi + yes | ${IMPALA_HOME}/testdata/bin/create-load-data.sh ${CREATE_LOAD_DATA_ARGS} fi -if [ $TESTS_ACTION -eq 1 ] -then - ${IMPALA_HOME}/bin/run-all-tests.sh -e $EXPLORATION_STRATEGY +if [ $TESTS_ACTION -eq 1 ]; then + ${IMPALA_HOME}/bin/run-all-tests.sh -e $EXPLORATION_STRATEGY fi # Generate list of files for Cscope to index diff --git a/testdata/avro_schema_resolution/create_table.sql b/testdata/avro_schema_resolution/create_table.sql index f9527f188..e75d389d1 100644 --- a/testdata/avro_schema_resolution/create_table.sql +++ b/testdata/avro_schema_resolution/create_table.sql @@ -25,8 +25,8 @@ INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' LOCATION '${hiveconf:hive.metastore.warehouse.dir}/avro_schema_resolution_test/'; -LOAD DATA LOCAL INPATH 'records1.avro' OVERWRITE INTO TABLE schema_resolution_test; -LOAD DATA LOCAL INPATH 'records2.avro' INTO TABLE schema_resolution_test; +LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records1.avro' OVERWRITE INTO TABLE schema_resolution_test; +LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/avro_schema_resolution/records2.avro' INTO TABLE schema_resolution_test; -- The following tables are used to test Impala's handling of HIVE-6308 which causes
-- COMPUTE STATS and Hive's ANALYZE TABLE to fail for Avro tables with mismatched @@ -105,4 +105,4 @@ STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' LOCATION '/test-warehouse/alltypes_avro_snap' -TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json'); \ No newline at end of file +TBLPROPERTIES ('avro.schema.url'='hdfs://${hiveconf:hive.metastore.warehouse.dir}/avro_schemas/functional/alltypes.json'); diff --git a/testdata/bin/check-schema-diff.sh b/testdata/bin/check-schema-diff.sh index 5d1fde947..e003a6816 100755 --- a/testdata/bin/check-schema-diff.sh +++ b/testdata/bin/check-schema-diff.sh @@ -19,12 +19,11 @@ # - 0 implies that the schema diff is emppty, or that a reference githash was not found. # - 1 implies that the schemas have changed. -. ${IMPALA_HOME}/bin/impala-config.sh +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 set -ex -# If /test-warehouse/githash.txt does not exist, exit with a 0 +DATASET=${1-} hdfs dfs -test -e /test-warehouse/githash.txt || { exit 0; } GIT_HASH=$(echo $(hdfs dfs -cat /test-warehouse/githash.txt)) # Check whether a non-empty diff exists. -# TODO: Make this more granular (on the level of a dataset) -git diff --exit-code ${GIT_HASH}..HEAD ${IMPALA_HOME}/testdata/datasets +git diff --exit-code ${GIT_HASH}..HEAD ${IMPALA_HOME}/testdata/datasets/$DATASET diff --git a/testdata/bin/compute-table-stats.sh b/testdata/bin/compute-table-stats.sh index 42604dbb5..26d4c0907 100755 --- a/testdata/bin/compute-table-stats.sh +++ b/testdata/bin/compute-table-stats.sh @@ -4,7 +4,7 @@ # set -e set -u -. ${IMPALA_HOME}/bin/impala-config.sh +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 # Run compute stats over as many of the tables used in the Planner tests as possible. 
python ${IMPALA_HOME}/tests/util/compute_table_stats.py --db_names=functional\ diff --git a/testdata/bin/copy-data-sources.sh b/testdata/bin/copy-data-sources.sh index ee561e3a0..69295d4e1 100755 --- a/testdata/bin/copy-data-sources.sh +++ b/testdata/bin/copy-data-sources.sh @@ -3,7 +3,7 @@ # # This script copies the test data source library into hdfs. -. ${IMPALA_HOME}/bin/impala-config.sh +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 set -e hadoop fs -mkdir -p /test-warehouse/data-sources/ diff --git a/testdata/bin/copy-udfs-udas.sh b/testdata/bin/copy-udfs-udas.sh index 9ae80b395..58baa7cf8 100755 --- a/testdata/bin/copy-udfs-udas.sh +++ b/testdata/bin/copy-udfs-udas.sh @@ -7,7 +7,7 @@ if [ x${JAVA_HOME} == x ]; then echo JAVA_HOME not set exit 1 fi -. ${IMPALA_HOME}/bin/impala-config.sh +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 set -e BUILD=0 diff --git a/testdata/bin/create-hbase.sh b/testdata/bin/create-hbase.sh index f9a696b75..73cc0249b 100755 --- a/testdata/bin/create-hbase.sh +++ b/testdata/bin/create-hbase.sh @@ -1,7 +1,7 @@ #!/bin/bash # Copyright (c) 2012 Cloudera, Inc. All rights reserved. -. ${IMPALA_HOME}/bin/impala-config.sh +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 # To work around the HBase bug (HBASE-4467), unset $HADOOP_HOME before calling hbase HADOOP_HOME= diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh index b35d12702..aee23df61 100755 --- a/testdata/bin/create-load-data.sh +++ b/testdata/bin/create-load-data.sh @@ -24,219 +24,309 @@ # For more information look at testdata/bin/load-test-warehouse-snapshot.sh and # bin/load-data.py -. ${IMPALA_HOME}/bin/impala-config.sh -set -ex - -# Setup for HDFS caching -${IMPALA_HOME}/testdata/bin/setup-hdfs-caching.sh - -# If the user has specified a command line argument, treat it as the test-warehouse -# snapshot file and pass it to the load-test-warehouse-snapshot.sh script for processing. 
-if [[ $1 ]]; then - ${IMPALA_HOME}/testdata/bin/load-test-warehouse-snapshot.sh "$1" -else - echo "Loading hive builtins" - ${IMPALA_HOME}/testdata/bin/load-hive-builtins.sh - - echo "Generating HBase data" - ${IMPALA_HOME}/testdata/bin/create-hbase.sh -fi -set -u +# Exit on error. +set -e +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 +SKIP_METADATA_LOAD=0 +SKIP_SNAPSHOT_LOAD=0 +SNAPSHOT_FILE="" +LOAD_DATA_ARGS="" +JDBC_URL="jdbc:hive2://localhost:11050/default;" DATA_LOADING_LOG_DIR=${IMPALA_TEST_CLUSTER_LOG_DIR}/data_loading mkdir -p ${DATA_LOADING_LOG_DIR} -# Copy the test data source library into HDFS -${IMPALA_HOME}/testdata/bin/copy-data-sources.sh +while [ -n "$*" ] +do + case $1 in + -skip_metadata_load) + SKIP_METADATA_LOAD=1 + ;; + -skip_snapshot_load) + SKIP_SNAPSHOT_LOAD=1 + ;; + -snapshot_file) + SNAPSHOT_FILE=${2-} + if [ ! -f $SNAPSHOT_FILE ]; then + echo "-snapshot_file does not exist: $SNAPSHOT_FILE" + exit 1; + fi + shift; + ;; + -help|-h|*) + echo "create-load-data.sh : Creates data and loads from scratch" + echo "[-skip_metadata_load] : Skips loading of metadata" + echo "[-skip_snapshot_load] : Assumes that the snapshot is already loaded" + echo "[-snapshot_file] : Loads the test warehouse snapshot into hdfs" + exit 1; + ;; + esac + shift; +done -# If a schema change is detected, force load the data. -set +e -LOAD_DATA_ARGS="" -${IMPALA_HOME}/testdata/bin/check-schema-diff.sh -if [[ $? -eq 1 ]]; then - LOAD_DATA_ARGS="--force" +if [[ $SKIP_METADATA_LOAD -eq 0 && "$SNAPSHOT_FILE" = "" ]]; then + echo "Loading Hive Builtins" + ${IMPALA_HOME}/testdata/bin/load-hive-builtins.sh + echo "Generating HBase data" + ${IMPALA_HOME}/testdata/bin/create-hbase.sh &> ${DATA_LOADING_LOG_DIR}/create-hbase.log +elif [ $SKIP_SNAPSHOT_LOAD -eq 0 ]; then + echo Loading hdfs data from snapshot: $SNAPSHOT_FILE + ${IMPALA_HOME}/testdata/bin/load-test-warehouse-snapshot.sh "$SNAPSHOT_FILE" + # Don't skip the metadata load if a schema change is detected. 
+ if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh; then + echo "Schema change detected, metadata will be loaded." + SKIP_METADATA_LOAD=0 + fi +else + # hdfs data already exists, don't load it. + echo Skipping loading data to hdfs. fi +function load-custom-schemas { + echo LOADING CUSTOM SCHEMAS + SCHEMA_SRC_DIR=${IMPALA_HOME}/testdata/data/schemas + SCHEMA_DEST_DIR=/test-warehouse/schemas + # clean the old schemas directory. + hadoop fs -rm -r -f ${SCHEMA_DEST_DIR} + hadoop fs -mkdir ${SCHEMA_DEST_DIR} + hadoop fs -put $SCHEMA_SRC_DIR/zipcode_incomes.parquet ${SCHEMA_DEST_DIR}/ + hadoop fs -put $SCHEMA_SRC_DIR/unsupported.parquet ${SCHEMA_DEST_DIR}/ + hadoop fs -put $SCHEMA_SRC_DIR/map.parquet ${SCHEMA_DEST_DIR}/ + hadoop fs -put $SCHEMA_SRC_DIR/array.parquet ${SCHEMA_DEST_DIR}/ + hadoop fs -put $SCHEMA_SRC_DIR/struct.parquet ${SCHEMA_DEST_DIR}/ + hadoop fs -put $SCHEMA_SRC_DIR/alltypestiny.parquet ${SCHEMA_DEST_DIR}/ + hadoop fs -put $SCHEMA_SRC_DIR/malformed_decimal_tiny.parquet ${SCHEMA_DEST_DIR}/ + hadoop fs -put $SCHEMA_SRC_DIR/decimal.parquet ${SCHEMA_DEST_DIR}/ + + # CHAR and VARCHAR tables written by Hive + hadoop fs -mkdir -p /test-warehouse/chars_formats_avro_snap/ + hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.avro \ + /test-warehouse/chars_formats_avro_snap + hadoop fs -mkdir -p /test-warehouse/chars_formats_parquet/ + hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.parquet \ + /test-warehouse/chars_formats_parquet + hadoop fs -mkdir -p /test-warehouse/chars_formats_text/ + hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.txt \ + /test-warehouse/chars_formats_text +} + +function load-data { + WORKLOAD=${1} + EXPLORATION_STRATEGY=${2:-"core"} + TABLE_FORMATS=${3:-} + + MSG="Loading workload '$WORKLOAD'" + ARGS=("--workloads $WORKLOAD") + MSG+=" Using exploration strategy '$EXPLORATION_STRATEGY'" + ARGS+=("-e $EXPLORATION_STRATEGY") + if [ $TABLE_FORMATS ]; then + MSG+=" in table formats '$TABLE_FORMATS'" + 
ARGS+=("--table_formats $TABLE_FORMATS") + fi + if [ $LOAD_DATA_ARGS ]; then + ARGS+=("$LOAD_DATA_ARGS") + fi + # functional-query is unique. The dataset name is not the same as the workload name. + if [ "${WORKLOAD}" = "functional-query" ]; then + WORKLOAD="functional" + fi + # Force load the dataset if we detect a schema change. + if ! ${IMPALA_HOME}/testdata/bin/check-schema-diff.sh $WORKLOAD; then + ARGS+=("--force") + echo "Force loading $WORKLOAD because a schema change was detected" + fi + LOG_FILE=${DATA_LOADING_LOG_DIR}/data-load-${WORKLOAD}-${EXPLORATION_STRATEGY}.log + echo "$MSG. Logging to ${LOG_FILE}" + # Use unbuffered logging by executing with 'python -u' + python -u ${IMPALA_HOME}/bin/load-data.py ${ARGS[@]} &> ${LOG_FILE} +} + +function cache-test-tables { + echo CACHING tpch.nation AND functional.alltypestiny + # uncaching the tables first makes this operation idempotent. + ${IMPALA_HOME}/bin/impala-shell.sh -q "alter table functional.alltypestiny set uncached" + ${IMPALA_HOME}/bin/impala-shell.sh -q "alter table tpch.nation set uncached" + ${IMPALA_HOME}/bin/impala-shell.sh -q "alter table tpch.nation set cached in 'testPool'" + ${IMPALA_HOME}/bin/impala-shell.sh -q\ + "alter table functional.alltypestiny set cached in 'testPool'" +} + +function load-aux-workloads { + echo LOADING AUXILIARY WORKLOADS + LOG_FILE=${DATA_LOADING_LOG_DIR}/data-load-auxiliary-workloads-core.log + rm -f $LOG_FILE + # Load all the auxiliary workloads (if any exist) + if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then + python -u ${IMPALA_HOME}/bin/load-data.py --workloads all\ + --workload_dir=${IMPALA_AUX_WORKLOAD_DIR}\ + --dataset_dir=${IMPALA_AUX_DATASET_DIR}\ + --exploration_strategy=core ${LOAD_DATA_ARGS} &>> $LOG_FILE + else + echo "Skipping load of auxilary workloads because directories do not exist" + fi +} + +function copy-auth-policy { + echo COPYING AUTHORIZATION POLICY FILE + hadoop fs -rm -f /test-warehouse/authz-policy.ini + 
hadoop fs -put ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini /test-warehouse/ +} + +function copy-and-load-dependent-tables { + # COPY + # TODO: The multi-format table will move these files. So we need to copy them to a + # temporary location for that table to use. Should find a better way to handle this. + echo COPYING AND LOADING DATA FOR DEPENDENT TABLES + hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat + hadoop fs -rm -r -f /tmp/alltypes_rc + hadoop fs -rm -r -f /tmp/alltypes_seq + hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009 + hadoop fs -mkdir -p /tmp/alltypes_rc/year=2009 + hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009 + hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009 + + # Create a hidden file in AllTypesSmall + hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/_hidden + hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/.hidden + hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \ + /test-warehouse/alltypessmall/year=2009/month=1/_hidden + hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \ + /test-warehouse/alltypessmall/year=2009/month=1/.hidden + + # For tables that rely on loading data from local fs test-warehouse + # TODO: Find a good way to integrate this with the normal data loading scripts + beeline -n $USER -u "${JDBC_URL}" -f\ + ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql +} + +function create-internal-hbase-table { + echo CREATING INTERNAL HBASE TABLE + # TODO: For some reason DROP TABLE IF EXISTS sometimes fails on HBase if the table does + # not exist. To work around this, disable exit on error before executing this command. + # Need to investigate this more, but this works around the problem to unblock automation. 
+ set +o errexit + beeline -n $USER -u "${JDBC_URL}" -e\ + "DROP TABLE IF EXISTS functional_hbase.internal_hbase_table" + echo "disable 'functional_hbase.internal_hbase_table'" | hbase shell + echo "drop 'functional_hbase.internal_hbase_table'" | hbase shell + set -e + # Used by CatalogTest to confirm that non-external HBase tables are identified + # correctly (IMP-581) + # Note that the usual 'hbase.table.name' property is not specified to avoid + # creating tables in HBase as a side-effect. + cat > /tmp/create-hbase-internal.sql << EOF + CREATE TABLE functional_hbase.internal_hbase_table(key int, value string) + STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' + WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val"); +EOF + beeline -n $USER -u "${JDBC_URL}" -f /tmp/create-hbase-internal.sql + rm -f /tmp/create-hbase-internal.sql +} + +function load-custom-data { + echo LOADING CUSTOM DATA + # Load the index files for corrupted lzo data. + hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index + hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \ + /test-warehouse/bad_text_lzo_text_lzo/ + + hadoop fs -rm -r -f /bad_text_lzo_text_lzo/ + hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ / + # Cleanup the old bad_text_lzo files, if they exist. 
+ hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/ + + # Index all lzo files in HDFS under /test-warehouse + ${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse + + hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/ + + # IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0 + hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \ + /test-warehouse/bad_parquet_parquet + + # Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary) + hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \ + /test-warehouse/bad_parquet_parquet + + # IMPALA-720: data file produced by parquet-mr with multiple row groups + hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \ + /test-warehouse/bad_parquet_parquet + + # IMPALA-1401: data file produced by Hive 13 containing page statistics with long min/max + # string values + hadoop fs -put -f ${IMPALA_HOME}/testdata/data/long_page_header.parquet \ + /test-warehouse/bad_parquet_parquet + + # Remove an index file so we test an un-indexed LZO file + hadoop fs -rm /test-warehouse/alltypes_text_lzo/year=2009/month=1/000000_0.lzo.index + + # Add a sequence file that only contains a header (see IMPALA-362) + hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \ + /test-warehouse/tinytable_seq_snap + + beeline -n $USER -u "${JDBC_URL}" -f\ + ${IMPALA_HOME}/testdata/avro_schema_resolution/create_table.sql +} + +function build-and-copy-hive-udfs { + # Build the test Hive UDFs + pushd ${IMPALA_HOME}/tests/test-hive-udfs + mvn clean package + popd + # Copy the test UDF/UDA libraries into HDFS + ${IMPALA_HOME}/testdata/bin/copy-udfs-udas.sh +} + +function copy-and-load-ext-data-source { + # Copy the test data source library into HDFS + ${IMPALA_HOME}/testdata/bin/copy-data-sources.sh + # Create data sources table. 
+ ${IMPALA_HOME}/bin/impala-shell.sh -f\ + ${IMPALA_HOME}/testdata/bin/create-data-source-table.sql +} + + +# Enable debug logging. +set -x + + # For kerberized clusters, use kerberos if ${CLUSTER_DIR}/admin is_kerberized; then LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}" fi -set -e +# Start Impala +${IMPALA_HOME}/bin/start-impala-cluster.py -s 3 --log_dir=${DATA_LOADING_LOG_DIR} +${IMPALA_HOME}/testdata/bin/setup-hdfs-caching.sh -# Load schemas -hadoop fs -rm -r -f /test-warehouse/schemas -hadoop fs -mkdir /test-warehouse/schemas -hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/zipcode_incomes.parquet \ - /test-warehouse/schemas/ -hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/unsupported.parquet \ - /test-warehouse/schemas/ -hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/map.parquet \ - /test-warehouse/schemas/ -hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/array.parquet \ - /test-warehouse/schemas/ -hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/struct.parquet \ - /test-warehouse/schemas/ -hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/alltypestiny.parquet \ - /test-warehouse/schemas/ -hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/malformed_decimal_tiny.parquet \ - /test-warehouse/schemas/ -hadoop fs -put ${IMPALA_HOME}/testdata/data/schemas/decimal.parquet \ - /test-warehouse/schemas/ - -# CHAR and VARCHAR tables written by Hive -hadoop fs -mkdir -p /test-warehouse/chars_formats_avro_snap/ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.avro \ - /test-warehouse/chars_formats_avro_snap -hadoop fs -mkdir -p /test-warehouse/chars_formats_parquet/ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.parquet \ - /test-warehouse/chars_formats_parquet -hadoop fs -mkdir -p /test-warehouse/chars_formats_text/ -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/chars-formats.txt \ - /test-warehouse/chars_formats_text - -# Load the data set -pushd ${IMPALA_HOME}/bin 
-./start-impala-cluster.py -s 3 --wait_for_cluster --log_dir=${DATA_LOADING_LOG_DIR} - -function load-data { - WORKLOAD=$1 - EXPLORATION_STRATEGY=$2 - LOG_FILE=${DATA_LOADING_LOG_DIR}/data-load-${WORKLOAD}-${EXPLORATION_STRATEGY}.log - echo "Loading workload: ${WORKLOAD} (${EXPLORATION_STRATEGY}). Logging to: "\ - "${LOG_FILE}" - # Use unbuffered logging by executing with 'python -u' - python -u ./load-data.py --workloads ${WORKLOAD} \ - --exploration_strategy ${EXPLORATION_STRATEGY} ${LOAD_DATA_ARGS} &> ${LOG_FILE} -} - -load-data "functional-query" "exhaustive" -load-data "tpch" "core" -load-data "tpcds" "core" - -# Cache test tables -./impala-shell.sh -q "alter table tpch.nation set cached in 'testPool'" -./impala-shell.sh -q "alter table functional.alltypestiny set cached in 'testPool'" - -# Load the test data source and table -./impala-shell.sh -f ${IMPALA_HOME}/testdata/bin/create-data-source-table.sql -# Load all the auxiliary workloads (if any exist) -if [ -d ${IMPALA_AUX_WORKLOAD_DIR} ] && [ -d ${IMPALA_AUX_DATASET_DIR} ]; then - python -u ./load-data.py --workloads all --workload_dir=${IMPALA_AUX_WORKLOAD_DIR}\ - --dataset_dir=${IMPALA_AUX_DATASET_DIR} --exploration_strategy core \ - ${LOAD_DATA_ARGS} +if [ $SKIP_METADATA_LOAD -eq 0 ]; then + # load custom schemas + load-custom-schemas + # load functional/tpcds/tpch + load-data "functional-query" "exhaustive" + load-data "tpch" "core" + load-data "tpcds" "core" + load-aux-workloads + copy-and-load-dependent-tables + load-custom-data + ${IMPALA_HOME}/testdata/bin/create-table-many-blocks.sh -p 1234 -b 1 + build-and-copy-hive-udfs else - echo "Skipping load of auxilary workloads because directories do not exist" + echo "Skipped loading the metadata. Loading HBase." + load-data "functional-query" "core" "hbase/none" fi -popd -# Create a table w/ 1234 partitions. Used to validate fetching/updating partitions in -# batches.
-${IMPALA_HOME}/testdata/bin/create-table-many-blocks.sh -p 1234 -b 1 -# Split HBase table -echo "Splitting HBase table" -${IMPALA_HOME}/testdata/bin/split-hbase.sh - -echo COPYING AUTHORIZATION POLICY FILE -hadoop fs -rm -f /test-warehouse/authz-policy.ini -hadoop fs -put ${IMPALA_HOME}/fe/src/test/resources/authz-policy.ini /test-warehouse/ - -# TODO: The multi-format table will move these files. So we need to copy them to a -# temporary location for that table to use. Should find a better way to handle this. -echo COPYING DATA FOR DEPENDENT TABLES -hadoop fs -rm -r -f /test-warehouse/alltypesmixedformat -hadoop fs -rm -r -f /tmp/alltypes_rc -hadoop fs -rm -r -f /tmp/alltypes_seq -hadoop fs -mkdir -p /tmp/alltypes_seq/year=2009 -hadoop fs -mkdir -p /tmp/alltypes_rc/year=2009 -hadoop fs -cp /test-warehouse/alltypes_seq/year=2009/month=2/ /tmp/alltypes_seq/year=2009 -hadoop fs -cp /test-warehouse/alltypes_rc/year=2009/month=3/ /tmp/alltypes_rc/year=2009 - -# Create a hidden file in AllTypesSmall -hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/_hidden -hadoop fs -rm -f /test-warehouse/alltypessmall/year=2009/month=1/.hidden -hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \ - /test-warehouse/alltypessmall/year=2009/month=1/_hidden -hadoop fs -cp /test-warehouse/zipcode_incomes/DEC_00_SF3_P077_with_ann_noheader.csv \ - /test-warehouse/alltypessmall/year=2009/month=1/.hidden - -# Configure alltypes_seq as a read-only table +# Configure alltypes_seq as a read-only table. This is required for fe tests. hadoop fs -chmod -R 444 /test-warehouse/alltypes_seq/year=2009/month=1 hadoop fs -chmod -R 444 /test-warehouse/alltypes_seq/year=2009/month=3 - -# TODO: For some reason DROP TABLE IF EXISTS sometimes fails on HBase if the table does -# not exist. To work around this, disable exit on error before executing this command. -# Need to investigate this more, but this works around the problem to unblock automation. 
-set +o errexit -${HIVE_HOME}/bin/hive -hiveconf hive.root.logger=WARN,console -v \ - -e "DROP TABLE IF EXISTS functional_hbase.internal_hbase_table" -echo "disable 'functional_hbase.internal_hbase_table'" | hbase shell -echo "drop 'functional_hbase.internal_hbase_table'" | hbase shell -set -e - -# For tables that rely on loading data from local fs test-warehouse -# TODO: Find a good way to integrate this with the normal data loading scripts -${HIVE_HOME}/bin/hive -hiveconf hive.root.logger=WARN,console -v \ - -f ${IMPALA_HOME}/testdata/bin/load-dependent-tables.sql -if [ $? != 0 ]; then - echo DEPENDENT LOAD FAILED - exit 1 -fi - -# Load the index files for corrupted lzo data. -hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index -hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \ - /test-warehouse/bad_text_lzo_text_lzo/ - -hadoop fs -rm -r -f /bad_text_lzo_text_lzo/ -hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ / -# Cleanup the old bad_text_lzo files, if they exist. 
-hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/ - -# Index all lzo files in HDFS under /test-warehouse -${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse - -hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/ - -# IMPALA-694: data file produced by parquet-mr version 1.2.5-cdh4.5.0 -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/bad_parquet_data.parquet \ - /test-warehouse/bad_parquet_parquet - -# Data file produced by parquet-mr with repeated values (produces 0 bit width dictionary) -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/repeated_values.parquet \ - /test-warehouse/bad_parquet_parquet - -# IMPALA-720: data file produced by parquet-mr with multiple row groups -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/multiple_rowgroups.parquet \ - /test-warehouse/bad_parquet_parquet - -# IMPALA-1401: data file produced by Hive 13 containing page statistics with long min/max -# string values -hadoop fs -put -f ${IMPALA_HOME}/testdata/data/long_page_header.parquet \ - /test-warehouse/bad_parquet_parquet - -# Remove an index file so we test an un-indexed LZO file -hadoop fs -rm /test-warehouse/alltypes_text_lzo/year=2009/month=1/000000_0.lzo.index - -# Add a sequence file that only contains a header (see IMPALA-362) -hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \ - /test-warehouse/tinytable_seq_snap - -# Create special table for testing Avro schema resolution -# (see testdata/avro_schema_resolution/README) -pushd ${IMPALA_HOME}/testdata/avro_schema_resolution -hive -f create_table.sql -popd - +cache-test-tables +copy-and-load-ext-data-source +# The tests need the built hive-udfs jar on the local fs +build-and-copy-hive-udfs +${IMPALA_HOME}/testdata/bin/split-hbase.sh > /dev/null 2>&1 +create-internal-hbase-table +# TODO: Investigate why all stats are not preserved. Theoretically, we only need to +# recompute stats for HBase.
${IMPALA_HOME}/testdata/bin/compute-table-stats.sh - -# Build the test Hive UDFs -pushd ${IMPALA_HOME}/tests/test-hive-udfs -mvn clean package -popd - -# Copy the test UDF/UDA libraries into HDFS -${IMPALA_HOME}/testdata/bin/copy-udfs-udas.sh - -${IMPALA_HOME}/bin/start-impala-cluster.py --kill_only +copy-auth-policy diff --git a/testdata/bin/create-table-many-blocks.sh b/testdata/bin/create-table-many-blocks.sh index a834bed5a..613f96f0e 100755 --- a/testdata/bin/create-table-many-blocks.sh +++ b/testdata/bin/create-table-many-blocks.sh @@ -20,7 +20,7 @@ # way a table with 100K blocks can be created by using 100 partitions x 1000 # blocks/files. -. ${IMPALA_HOME}/bin/impala-config.sh +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 set -e set -u diff --git a/testdata/bin/load-dependent-tables.sql b/testdata/bin/load-dependent-tables.sql index b6b752e52..7bb5e1fd6 100644 --- a/testdata/bin/load-dependent-tables.sql +++ b/testdata/bin/load-dependent-tables.sql @@ -49,15 +49,6 @@ ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=2) ALTER TABLE alltypesmixedformat PARTITION (year=2009, month=3) SET FILEFORMAT RCFILE; ----- --- Used by CatalogTest to confirm that non-external HBase tables are identified --- correctly (IMP-581) --- Note that the usual 'hbase.table.name' property is not specified to avoid --- creating tables in HBase as a side-effect. -CREATE TABLE functional_hbase.internal_hbase_table(key int, value string) -STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' -WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf1:val"); - ---- Unsupported Impala table types USE functional; CREATE VIEW IF NOT EXISTS hive_view AS SELECT 1 AS int_col FROM alltypes limit 1; diff --git a/testdata/bin/load-hive-builtins.sh b/testdata/bin/load-hive-builtins.sh index 639c97a63..a3e647f19 100755 --- a/testdata/bin/load-hive-builtins.sh +++ b/testdata/bin/load-hive-builtins.sh @@ -1,14 +1,14 @@ #!/bin/bash -. ${IMPALA_HOME}/bin/impala-config.sh +. 
${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 # TODO: remove this once we understand why Hive looks in HDFS for many of its jars -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${HIVE_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${HIVE_HOME}/lib/ +${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${HIVE_HOME}/lib/ +${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${HIVE_HOME}/lib/ ${HADOOP_HOME}/bin/hadoop fs -put ${HIVE_HOME}/lib/*.jar ${HIVE_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${HBASE_HOME}/lib/ -${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${HBASE_HOME}/lib/ +${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${HBASE_HOME}/lib/ +${HADOOP_HOME}/bin/hadoop fs -mkdir -p ${HBASE_HOME}/lib/ ${HADOOP_HOME}/bin/hadoop fs -put ${HBASE_HOME}/lib/*.jar ${HBASE_HOME}/lib/ ${HADOOP_HOME}/bin/hadoop fs -rm -r -f ${HADOOP_HOME}/share/hadoop/common/ diff --git a/testdata/bin/load-metastore-snapshot.sh b/testdata/bin/load-metastore-snapshot.sh index ecc187dfe..89a2c4135 100755 --- a/testdata/bin/load-metastore-snapshot.sh +++ b/testdata/bin/load-metastore-snapshot.sh @@ -53,6 +53,10 @@ dropdb -U hiveuser hive_impala createdb -U hiveuser hive_impala # Copy the contents of the SNAPSHOT_FILE psql -U hiveuser hive_impala < ${SNAPSHOT_FILE} > /dev/null 2>&1 - - - +# Two tables (tpch.nation and functional.alltypestiny) have cache_directive_id set in +# their metadata. These directives are now stale, and will cause any query that attempts +# to cache the data in the tables to fail. 
+psql -U hiveuser -d hive_impala -c \ + "delete from \"TABLE_PARAMS\" where \"PARAM_KEY\"='cache_directive_id'" +psql -U hiveuser -d hive_impala -c \ + "delete from \"PARTITION_PARAMS\" where \"PARAM_KEY\"='cache_directive_id'" diff --git a/testdata/bin/load-test-warehouse-snapshot.sh b/testdata/bin/load-test-warehouse-snapshot.sh index af94237a4..415241e6e 100755 --- a/testdata/bin/load-test-warehouse-snapshot.sh +++ b/testdata/bin/load-test-warehouse-snapshot.sh @@ -20,7 +20,7 @@ # NOTE: Running this script will remove your existing test-warehouse directory. Be sure # to backup any data you need before running this script. -. ${IMPALA_HOME}/bin/impala-config.sh +. ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1 TEST_WAREHOUSE_HDFS_DIR=/test-warehouse if [[ ! $1 ]]; then @@ -64,6 +64,11 @@ mkdir ${SNAPSHOT_STAGING_DIR} echo "Extracting tarball" tar -C ${SNAPSHOT_STAGING_DIR} -xzf ${SNAPSHOT_FILE} +if [ ! -f ${SNAPSHOT_STAGING_DIR}/test-warehouse/githash.txt ]; then + echo "The test-warehouse snapshot does not containa githash, aborting load" + exit 1 +fi + echo "Copying data to HDFS" hadoop fs -put ${SNAPSHOT_STAGING_DIR}/test-warehouse/* ${TEST_WAREHOUSE_HDFS_DIR} diff --git a/testdata/workloads/tpch/queries/tpch-q15.test b/testdata/workloads/tpch/queries/tpch-q15.test index b586607f2..73634044b 100644 --- a/testdata/workloads/tpch/queries/tpch-q15.test +++ b/testdata/workloads/tpch/queries/tpch-q15.test @@ -35,4 +35,4 @@ order by 8449,'Supplier#000008449','Wp34zim9qYFbVctdW','20-469-856-8873',1772627.2087 ---- TYPES BIGINT, STRING, STRING, STRING, DECIMAL -==== \ No newline at end of file +==== diff --git a/tests/util/compute_table_stats.py b/tests/util/compute_table_stats.py index 14bddf9a2..e4b2e2ab8 100755 --- a/tests/util/compute_table_stats.py +++ b/tests/util/compute_table_stats.py @@ -29,17 +29,10 @@ def compute_stats(impala_client, db_names=None, table_names=None, all_dbs = set(name.lower() for name in impala_client.execute("show databases").data) 
selected_dbs = all_dbs if db_names is None else set(db_names) - if db_names is not None: - print 'Skipping compute stats on databases:\n%s' % '\n'.join(all_dbs - selected_dbs) - for db in all_dbs.intersection(selected_dbs): all_tables =\ set([t.lower() for t in impala_client.execute("show tables in %s" % db).data]) selected_tables = all_tables if table_names is None else set(table_names) - if table_names: - print 'Skipping compute stats on tables:\n%s' %\ - '\n'.join(['%s.%s' % (db, tbl) for tbl in all_tables - selected_tables]) - for table in all_tables.intersection(selected_tables): statement = "compute stats %s.%s" % (db, table) print 'Executing: %s' % statement