mirror of
https://github.com/apache/impala.git
synced 2025-12-19 09:58:28 -05:00
What works:
* A single node cluster can be started up with docker-compose
* HMS data is stored in Derby database in a docker volume
* Filesystem data is stored in a shared docker volume, using the
localfs support in the Hadoop client.
* A Kudu cluster with a single master can be optionally added on
to the Impala cluster.
* TPC-DS data can be loaded automatically by a data loading container.
We need to set up a docker network called quickstart-network,
purely because docker-compose insists on generating network names
with underscores, which are part of the FQDN and end up causing
problems with Java's URL parsing, which rejects these technically
invalid domain names.
How to run:
Instructions for running the quickstart cluster are in
docker/README.md.
How to build containers:
./buildall.sh -release -noclean -notests -ninja
ninja quickstart_hms_image quickstart_client_image docker_images
How to upload containers to dockerhub:
IMPALA_QUICKSTART_IMAGE_PREFIX=timgarmstrong/
for i in impalad_coord_exec impalad_coordinator statestored \
impalad_executor catalogd impala_quickstart_client \
impala_quickstart_hms
do
docker tag $i ${IMPALA_QUICKSTART_IMAGE_PREFIX}$i
docker push ${IMPALA_QUICKSTART_IMAGE_PREFIX}$i
done
I pushed containers built from commit f260cce22, which
was branched from 6cb7cecacf on master.
Misc other stuff:
* Added more metadata to all images.
TODO:
* Test and instructions to run against Kudu quickstart
* Upload latest version of containers before merging.
Change-Id: Ifc0b862af40a368381ada7ec2a355fe4b0aa778c
Reviewed-on: http://gerrit.cloudera.org:8080/15966
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
87 lines
3.2 KiB
Bash
Executable File
87 lines
3.2 KiB
Bash
Executable File
#!/bin/bash
|
|
################################################################################
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
################################################################################
|
|
#
|
|
# This script follows the pattern described in the docker best practices here:
|
|
# https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#entrypoint
|
|
################################################################################
|
|
|
|
set -euo pipefail
set -x

# Entry point dispatch on the first argument:
#   (no arguments)  - print an error and exit 1.
#   load_tpcds      - generate TPC-DS data (unless the marker file is already
#                     present), wait for Impala, then run the load scripts.
#   impala-shell    - exec impala-shell against the impalad, forwarding any
#                     remaining arguments.
#   anything else   - exec the arguments as a command.

# The impalad instance that impala-shell connects to (passed via -i).
IMPALAD_HOST=docker_impalad-1_1

if [[ $# -eq 0 ]]; then
  echo "Must provide at least one argument." >&2
  exit 1
elif [[ "$1" = "load_tpcds" ]]; then
  echo "Loading TPC-DS data"
  IMPALA_TOOLCHAIN_BASE=https://native-toolchain.s3.amazonaws.com/build/7-f2ddef91e9/
  TPCDS_VERSION=2.1.0
  TPCDS_TARBALL=tpc-ds-${TPCDS_VERSION}-gcc-4.9.2-ec2-package-ubuntu-18-04.tar.gz
  TPCDS_URL=${IMPALA_TOOLCHAIN_BASE}tpc-ds/${TPCDS_VERSION}-gcc-4.9.2/${TPCDS_TARBALL}

  # --fail makes curl exit non-zero on an HTTP error instead of saving the
  # error page as the tarball, so a bad download stops here under 'set -e'
  # rather than surfacing later as a confusing tar failure.
  # NOTE(review): the tarball looks like it is only needed for dsdgen below, so
  # the download could probably move inside the marker check - confirm.
  curl --fail "${TPCDS_URL}" --output tpcds.tar.gz
  tar xzf tpcds.tar.gz

  # The base directory for Hive external tables, in a mounted volume.
  WAREHOUSE_EXTERNAL_DIR=/user/hive/warehouse/external
  TPCDS_RAW_DIR=${WAREHOUSE_EXTERNAL_DIR}/tpcds_raw

  # Use a marker file to avoid regenerating the data if already present in
  # the warehouse. dsdgen is a serial process and somewhat slow.
  if [[ ! -e "${TPCDS_RAW_DIR}/generated" ]]; then
    SCALE_FACTOR=1
    # Generate the data. This creates one .dat file for each table.
    "./tpc-ds-${TPCDS_VERSION}/bin/dsdgen" -force -verbose -scale "${SCALE_FACTOR}"

    # Move the tables into the warehouse, one per subdirectory.
    for FILE in *.dat; do
      FILE_DIR="${TPCDS_RAW_DIR}/${FILE%.dat}"
      # ':?' aborts instead of running 'rm -rf' on an empty path.
      rm -rf "${FILE_DIR:?}"
      mkdir -p "${FILE_DIR}"
      mv "${FILE}" "${FILE_DIR}"
    done
    # Written last, so the marker only exists once every table has been moved.
    touch "${TPCDS_RAW_DIR}/generated"
  fi

  # Keep the command as an array so it expands to exact words without relying
  # on unquoted word-splitting.
  IMPALA_SHELL=(impala-shell --protocol=hs2 -i "${IMPALAD_HOST}")

  # Wait until Impala comes up (it started in parallel with the data loader).
  # 300 attempts with a 0.5s sleep between failed attempts.
  impala_up=false
  for ((attempt = 1; attempt <= 300; attempt++)); do
    if "${IMPALA_SHELL[@]}" -q 'select version()'; then
      impala_up=true
      break
    fi
    echo "Waiting for impala to come up"
    sleep 0.5
  done
  if [[ "${impala_up}" != true ]]; then
    # Fail with a clear message instead of falling through to the load step.
    echo "Impala did not come up after 300 attempts; giving up." >&2
    exit 1
  fi

  "${IMPALA_SHELL[@]}" -f /opt/impala/sql/load_tpcds_parquet.sql

  # Load data into Kudu if the Kudu master is up.
  if ping -c1 kudu-master-1; then
    "${IMPALA_SHELL[@]}" -f /opt/impala/sql/load_tpcds_kudu.sql
  fi
elif [[ "$1" = "impala-shell" ]]; then
  shift
  # Execute impala-shell with any extra arguments provided.
  exec impala-shell --protocol=hs2 --history_file=/tmp/impalahistory \
    -i "${IMPALAD_HOST}" "$@"
else
  # Execute the provided input as a command.
  exec "$@"
fi
|