#!/bin/bash ################################################################################ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ################################################################################ # # This script follows the pattern described in the docker best practices here: # https://docs.docker.com/develop/develop-images/dockerfile_best-practices/#entrypoint ################################################################################ set -euo pipefail set -x if [[ $# -eq 0 ]]; then echo "Must provide at least one argument." exit 1 elif [[ "$1" = "load_tpcds" ]]; then echo "Loading TPC-DS data" IMPALA_TOOLCHAIN_BASE=https://native-toolchain.s3.amazonaws.com/build/7-f2ddef91e9/ TPCDS_VERSION=2.1.0 TPCDS_TARBALL=tpc-ds-${TPCDS_VERSION}-gcc-4.9.2-ec2-package-ubuntu-18-04.tar.gz TPCDS_URL=${IMPALA_TOOLCHAIN_BASE}tpc-ds/${TPCDS_VERSION}-gcc-4.9.2/${TPCDS_TARBALL} curl ${TPCDS_URL} --output tpcds.tar.gz tar xzf tpcds.tar.gz # The base directory for Hive external tables, in a mounted volume. WAREHOUSE_EXTERNAL_DIR=/user/hive/warehouse/external TPCDS_RAW_DIR=${WAREHOUSE_EXTERNAL_DIR}/tpcds_raw # Use a marker file to avoid regenerating the data if already present in # the warehouse. dsdgen is a serial process and somewhat slow. if ! stat ${TPCDS_RAW_DIR}/generated; then SCALE_FACTOR=1 # Generate the data. This creates one .dat file for each table. ./tpc-ds-${TPCDS_VERSION}/bin/dsdgen -force -verbose -scale ${SCALE_FACTOR} # Move the tables into the warehouse, one per subdirectory for FILE in *.dat; do FILE_DIR=${TPCDS_RAW_DIR}/${FILE%.dat} rm -rf "${FILE_DIR}" mkdir -p "${FILE_DIR}" mv "${FILE}" "${FILE_DIR}" done touch ${TPCDS_RAW_DIR}/generated fi IMPALA_SHELL="impala-shell --protocol=hs2 -i docker_impalad-1_1" # Wait until Impala comes up (it started in parallel with the data loader). for i in $(seq 300); do if ${IMPALA_SHELL} -q 'select version()'; then break fi echo "Waiting for impala to come up" sleep 0.5 done ${IMPALA_SHELL} -f /opt/impala/sql/load_tpcds_parquet.sql # Load data into Kudu if the Kudu master is up. if ping -c1 kudu-master-1; then ${IMPALA_SHELL} -f /opt/impala/sql/load_tpcds_kudu.sql fi elif [[ "$1" = "impala-shell" ]]; then shift # Execute impala-shell with any extra arguments provided. exec impala-shell --protocol=hs2 --history_file=/tmp/impalahistory \ -i docker_impalad-1_1 "$@" else # Execute the provided input as a command exec "$@" fi