#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Create the test environment needed by Impala. Includes generation of the
# Hadoop config files: core-site.xml, hbase-site.xml, hive-site.xml as well
# as creation of the Hive metastore.

set -euo pipefail
. $IMPALA_HOME/bin/report_build_error.sh
setup_report_build_error

# Perform search-replace on $1, output to $2.
# Search $1 ($GCIN) for strings that look like "${FOO}". If FOO is defined in
# the environment then replace "${FOO}" with the environment value. Also
# remove or leave special kerberos settings as desired. Sanity check at end.
#
# NOTE: for Hadoop-style XML configuration files (foo-site.xml) prefer using
# bin/generate_xml_config.py instead of this method. This method is useful for
# ini-style or other configuration formats.
#
# TODO(todd): convert remaining 'foo-site.xml' files to use the preferred
# mechanism.
#
# TODO(todd): consider a better Python-based templating system for the other
# configuration files as well.
function generate_config {
  GCIN="$1"
  GCOUT="$2"

  perl -wpl -e 's/\$\{([^}]+)\}/defined $ENV{$1} ? $ENV{$1} : $&/eg' \
    "${GCIN}" > "${GCOUT}.tmp"

  if [[ "${IMPALA_KERBERIZE}" != "true" ]]; then
    sed '/<!-- BEGIN Kerberos/,/END Kerberos settings -->/d' \
      "${GCOUT}.tmp" > "${GCOUT}"
  else
    cp "${GCOUT}.tmp" "${GCOUT}"
  fi
  rm -f "${GCOUT}.tmp"

  # Check for anything that might have been missed.
  # Assumes that environment variables will be ALL CAPS...
  if grep '\${[A-Z_]*}' "${GCOUT}"; then
    echo "Found undefined variables in ${GCOUT}, aborting"
    exit 1
  fi

  echo "Generated `pwd`/${GCOUT}"
}
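# A minimal usage sketch of generate_config (hypothetical paths and values,
# not executed here):
#   echo 'db.name=${METASTORE_DB}' > /tmp/example.template
#   export METASTORE_DB=hms_test
#   generate_config /tmp/example.template /tmp/example.conf
#   # /tmp/example.conf now contains "db.name=hms_test". Any ${VAR} that is
#   # not defined in the environment is left untouched by the perl step and
#   # then rejected by the grep sanity check above.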
CREATE_METASTORE=0
CREATE_RANGER_POLICY_DB=0
UPGRADE_METASTORE_DB=0

# parse command line options
for ARG in $*
do
  case "$ARG" in
    -create_metastore)
      CREATE_METASTORE=1
      ;;
    -create_ranger_policy_db)
      CREATE_RANGER_POLICY_DB=1
      ;;
    -upgrade_metastore_db)
      UPGRADE_METASTORE_DB=1
      ;;
    -help|*)
      echo "[-create_metastore] : If true, creates a new metastore."
      echo "[-create_ranger_policy_db] : If true, creates a new Ranger policy db."
      echo "[-upgrade_metastore_db] : If true, upgrades the schema of HMS db."
      exit 1
      ;;
  esac
done

# If this isn't sourced, bad things will always happen
if [ "${IMPALA_CONFIG_SOURCED}" != "1" ]; then
  echo "You must source bin/impala-config.sh"
  exit 1
fi

${CLUSTER_DIR}/admin create_cluster

if [[ "${IMPALA_KERBERIZE}" = "true" ]]; then
  # Sanity check...
  if ! ${CLUSTER_DIR}/admin is_kerberized; then
    echo "Kerberized cluster not created, even though told to."
    exit 1
  fi

  # Set some more environment variables.
  . ${MINIKDC_ENV}

  # For hive-site.xml further down...
  export HIVE_S2_AUTH=KERBEROS
else
  export HIVE_S2_AUTH=NONE
fi

export CURRENT_USER=`whoami`

CONFIG_DIR=${IMPALA_HOME}/fe/src/test/resources
RANGER_TEST_CONF_DIR="${IMPALA_HOME}/testdata/cluster/ranger"

echo "Config dir: ${CONFIG_DIR}"
echo "Current user: ${CURRENT_USER}"
echo "Metastore DB: ${METASTORE_DB}"
echo "Ranger DB : ${RANGER_POLICY_DB}"

pushd ${CONFIG_DIR}

# Cleanup any existing files
rm -f {core,hdfs,hbase,hive,ozone,yarn,mapred}-site.xml
rm -f authz-provider.ini

# Generate hive configs first so that schemaTool can be used to init the metastore schema
# if needed

# Set IMPALA_JAVA_TOOL_OPTIONS to allow passing it to Tez containers.
. $IMPALA_HOME/bin/set-impala-java-tool-options.sh

CORE_SITE_VARIANT=disable_block_locations $IMPALA_HOME/bin/generate_xml_config.py \
  $IMPALA_HOME/testdata/cluster/node_templates/common/etc/hadoop/conf/core-site.xml.py \
  core-site_disabled_block_locations.xml
mkdir -p core-site-disabled-block-locations
rm -f core-site-disabled-block-locations/core-site.xml
ln -s "${CONFIG_DIR}/core-site_disabled_block_locations.xml" \
  core-site-disabled-block-locations/core-site.xml

$IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site.xml

export HIVE_VARIANT=changed_external_dir
$IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site_ext.xml
mkdir -p hive-site-ext
rm -f hive-site-ext/hive-site.xml
ln -s "${CONFIG_DIR}/hive-site_ext.xml" hive-site-ext/hive-site.xml

export HIVE_VARIANT=without_hms_config
$IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site_without_hms.xml
mkdir -p hive-site-without-hms
rm -f hive-site-without-hms/hive-site.xml
ln -s "${CONFIG_DIR}/hive-site_without_hms.xml" hive-site-without-hms/hive-site.xml

export HIVE_VARIANT=events_cleanup
$IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site_events_cleanup.xml
mkdir -p hive-site-events-cleanup
rm -f hive-site-events-cleanup/hive-site.xml
ln -s "${CONFIG_DIR}/hive-site_events_cleanup.xml" hive-site-events-cleanup/hive-site.xml

export HIVE_VARIANT=housekeeping_on
$IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site_housekeeping_on.xml
mkdir -p hive-site-housekeeping-on
rm -f hive-site-housekeeping-on/hive-site.xml
ln -s "${CONFIG_DIR}/hive-site_housekeeping_on.xml" \
  hive-site-housekeeping-on/hive-site.xml

export HIVE_VARIANT=events_config_change
$IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site_events_config.xml
mkdir -p hive-site-events-config
rm -f hive-site-events-config/hive-site.xml
ln -s "${CONFIG_DIR}/hive-site_events_config.xml" \
  hive-site-events-config/hive-site.xml

export HIVE_VARIANT=ranger_auth
HIVE_RANGER_CONF_DIR=hive-site-ranger-auth
$IMPALA_HOME/bin/generate_xml_config.py hive-site.xml.py hive-site_ranger_auth.xml
# Cleanup pycache if created
rm -rf __pycache__
rm -rf $HIVE_RANGER_CONF_DIR
mkdir -p $HIVE_RANGER_CONF_DIR
ln -s "${CONFIG_DIR}/hive-site_ranger_auth.xml" $HIVE_RANGER_CONF_DIR/hive-site.xml
# Link some necessary config files for Hive.
for f in ranger-hive-security.xml ranger-hive-audit.xml log4j.properties \
    hive-log4j2.properties; do
  ln -s "${CONFIG_DIR}/$f" "$HIVE_RANGER_CONF_DIR/$f"
done
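# Each variant directory above now holds a single hive-site.xml symlink, e.g.:
#   hive-site-ext/hive-site.xml -> ${CONFIG_DIR}/hive-site_ext.xml
#   hive-site-ranger-auth/hive-site.xml -> ${CONFIG_DIR}/hive-site_ranger_auth.xml
# so a test can run Hive services against an alternative configuration simply
# by using one of these directories as the Hive conf dir.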
generate_config hive-log4j2.properties.template hive-log4j2.properties

if [ $CREATE_METASTORE -eq 1 ]; then
  echo "Creating postgresql database for Hive metastore"
  dropdb -U hiveuser ${METASTORE_DB} || true
  createdb -U hiveuser ${METASTORE_DB}

  # Use schematool to initialize the metastore db schema. It detects the Hive
  # version and invokes the appropriate scripts.
  CLASSPATH=${CLASSPATH}:${CONFIG_DIR} ${HIVE_HOME}/bin/schematool -initSchema -dbType \
    postgres 1>${IMPALA_CLUSTER_LOGS_DIR}/schematool.log 2>&1

  # TODO: We probably don't need to do this anymore
  # Increase the size limit of PARAM_VALUE from SERDE_PARAMS table to be able to create
  # HBase tables with large number of columns.
  echo "alter table \"SERDE_PARAMS\" alter column \"PARAM_VALUE\" type character varying" \
    | psql -q -U hiveuser -d ${METASTORE_DB}
fi

if [ $UPGRADE_METASTORE_DB -eq 1 ]; then
  echo "Upgrading the schema of metastore db ${METASTORE_DB}. Check \
${IMPALA_CLUSTER_LOGS_DIR}/schematool.log for details."
  CLASSPATH=${CLASSPATH}:${CONFIG_DIR} ${HIVE_HOME}/bin/schematool -upgradeSchema \
    -dbType postgres 1>${IMPALA_CLUSTER_LOGS_DIR}/schematool.log 2>&1
fi
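# To sanity-check the resulting schema, schematool's standard -info flag can
# print the version recorded in the metastore DB (optional, not run here):
#   CLASSPATH=${CLASSPATH}:${CONFIG_DIR} ${HIVE_HOME}/bin/schematool -info -dbType postgres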
if [ $CREATE_RANGER_POLICY_DB -eq 1 ]; then
  echo "Creating Ranger Policy Server DB"
  dropdb -U hiveuser "${RANGER_POLICY_DB}" 2> /dev/null || true
  createdb -U hiveuser "${RANGER_POLICY_DB}"
  pushd "${RANGER_HOME}"
  generate_config "${RANGER_TEST_CONF_DIR}/install.properties.template" install.properties
  python ./db_setup.py
  popd
fi

echo "Copying common conf files from local cluster:"
CLUSTER_HADOOP_CONF_DIR=$(${CLUSTER_DIR}/admin get_hadoop_client_conf_dir)
for file in core-site.xml hdfs-site.xml ozone-site.xml yarn-site.xml ; do
  echo ... $file
  # These need to be copied instead of symlinked so that they can be accessed when the
  # directory is bind-mounted into /opt/impala/conf in docker containers.
  cp ${CLUSTER_HADOOP_CONF_DIR}/$file .
done

if [[ "${IMPALA_KERBERIZE}" = "true" ]]; then
  # KERBEROS TODO: Without this, the yarn daemons can see these
  # files, but mapreduce jobs *cannot* see these files. This seems
  # strange, but making these symlinks also results in data loading
  # failures in the non-kerberized case. Without these, mapreduce
  # jobs die in a kerberized cluster because they can't find their
  # kerberos principals. Obviously this has to be sorted out before
  # a kerberized cluster can load data.
  echo "Linking yarn and mapred from local cluster"
  ln -s ${CLUSTER_HADOOP_CONF_DIR}/mapred-site.xml
fi

generate_config log4j.properties.template log4j.properties
generate_config hbase-site.xml.template hbase-site.xml

if [[ "${IMPALA_KERBERIZE}" = "true" ]]; then
  generate_config hbase-jaas-server.conf.template hbase-jaas-server.conf
  generate_config hbase-jaas-client.conf.template hbase-jaas-client.conf
fi

popd

RANGER_SERVER_CONF_DIR="${RANGER_HOME}/ews/webapp/WEB-INF/classes/conf"
RANGER_SERVER_CONFDIST_DIR="${RANGER_HOME}/ews/webapp/WEB-INF/classes/conf.dist"
RANGER_SERVER_LIB_DIR="${RANGER_HOME}/ews/webapp/WEB-INF/lib"
RANGER_ADMIN_LOGBACK_CONF_FILE="${RANGER_SERVER_CONFDIST_DIR}/logback.xml"
RANGER_ADMIN_LOG4J2_CONF_FILE="${RANGER_HOME}/ews/webapp/WEB-INF/log4j2.properties"
RANGER_LOG_DIR="${IMPALA_CLUSTER_LOGS_DIR}/ranger"
if [[ ! -d "${RANGER_SERVER_CONF_DIR}" ]]; then
  mkdir -p "${RANGER_SERVER_CONF_DIR}"
fi
cp -f "${RANGER_TEST_CONF_DIR}/java_home.sh" "${RANGER_SERVER_CONF_DIR}"
cp -f "${RANGER_TEST_CONF_DIR}/ranger-admin-env-logdir.sh" "${RANGER_SERVER_CONF_DIR}"
cp -f "${RANGER_TEST_CONF_DIR}/ranger-admin-env-piddir.sh" "${RANGER_SERVER_CONF_DIR}"
cp -f "${RANGER_SERVER_CONFDIST_DIR}/security-applicationContext.xml" \
  "${RANGER_SERVER_CONF_DIR}"

# For Apache Ranger, we need logback.xml under ${RANGER_SERVER_CONF_DIR} so that the log
# files like ranger-admin-$(hostname)-$(whoami).log could be created under
# ${RANGER_LOG_DIR}.
if [[ -f ${RANGER_ADMIN_LOGBACK_CONF_FILE} ]]; then
  cp -f ${RANGER_ADMIN_LOGBACK_CONF_FILE} ${RANGER_SERVER_CONF_DIR}
fi

# For CDP Ranger, we change the value of the property 'log.dir' in the corresponding
# log4j2.properties so that the log files like ranger-admin-server.log could be created
# under ${RANGER_LOG_DIR}.
if [[ -f ${RANGER_ADMIN_LOG4J2_CONF_FILE} ]]; then
  # Use vertical bar instead of slash as the separator to prevent the slash(es) in
  # ${RANGER_LOG_DIR} from interfering with the parsing of sed.
  sed -i "s|property\.log\.dir=.*|property.log.dir=${RANGER_LOG_DIR}|g" \
    ${RANGER_ADMIN_LOG4J2_CONF_FILE}
fi

# Prepend the following 5 URL's to the line starting with "<security:intercept-url
# pattern=\"/**\"" so that the REST endpoints backing GRANT/REVOKE ROLE TO/FROM GROUP
# <group>, and SHOW ROLES could work in a non-Kerberized environment. It is better to
# add the allowed links using sed than to use a hardcoded configuration file consisting
# of those links since some other configurations could change after CDP_BUILD_NUMBER is
# bumped up, e.g., the version of jquery. (The five role-related patterns below are
# representative of Ranger's role REST API under /service/roles and
# /service/public/v2/api/roles.)
sed -i '/<security:intercept-url pattern="\/\*\*"/i \
\ \ \ \ <security:intercept-url pattern="/service/roles/names*" access="permitAll"/>\
\ \ \ \ <security:intercept-url pattern="/service/roles/grant/*" access="permitAll"/>\
\ \ \ \ <security:intercept-url pattern="/service/roles/revoke/*" access="permitAll"/>\
\ \ \ \ <security:intercept-url pattern="/service/roles/lookup/*" access="permitAll"/>\
\ \ \ \ <security:intercept-url pattern="/service/public/v2/api/roles*" access="permitAll"/>' \
  "${RANGER_SERVER_CONF_DIR}/security-applicationContext.xml"

if [[ -f "${POSTGRES_JDBC_DRIVER}" ]]; then
  cp -f "${POSTGRES_JDBC_DRIVER}" "${RANGER_SERVER_LIB_DIR}"
else
  # IMPALA-8261: Running this script should not fail when FE has not been built.
  MAVEN_URL="https://repo.maven.apache.org/maven2/org/postgresql/postgresql"
  JDBC_JAR="postgresql-${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}.jar"
  wget -P "${RANGER_SERVER_LIB_DIR}" \
    "${MAVEN_URL}/${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}/${JDBC_JAR}"
fi

pushd "${RANGER_SERVER_CONF_DIR}"
generate_config "${RANGER_TEST_CONF_DIR}/ranger-admin-default-site.xml.template" \
  ranger-admin-default-site.xml
generate_config "${RANGER_TEST_CONF_DIR}/ranger-admin-site.xml.template" \
  ranger-admin-site.xml
popd

echo "Completed config generation"

# Creates a symlink in TARGET_DIR to all subdirectories under SOURCE_DIR
function symlink_subdirs {
  SOURCE_DIR=$1
  TARGET_DIR=$2
  if [ -d "${SOURCE_DIR}" ]; then
    find ${SOURCE_DIR}/ -maxdepth 1 -mindepth 1 -type d -exec ln -f -s {} ${TARGET_DIR} \;
  else
    echo "No auxiliary tests found at: ${SOURCE_DIR}"
  fi
}

# The Impala test framework supports running additional tests outside of the main repo.
# This is an optional feature that can be enabled by setting the IMPALA_AUX_* environment
# variables to valid locations.
echo "Searching for auxiliary tests, workloads, and datasets (if any exist)."
symlink_subdirs ${IMPALA_AUX_WORKLOAD_DIR} ${IMPALA_WORKLOAD_DIR}
symlink_subdirs ${IMPALA_AUX_DATASET_DIR} ${IMPALA_DATASET_DIR}

if [ -d ${IMPALA_AUX_TEST_HOME}/tests/functional ]; then
  symlink_subdirs ${IMPALA_AUX_TEST_HOME}/tests/functional ${IMPALA_HOME}/tests
else
  # For compatibility with older auxiliary tests, which aren't in the
  # functional subdirectory.
  symlink_subdirs ${IMPALA_AUX_TEST_HOME}/tests ${IMPALA_HOME}/tests
fi
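# For example (hypothetical paths): if IMPALA_AUX_WORKLOAD_DIR=/opt/aux/workloads
# contains a subdirectory "custom-workload", the call above leaves
#   ${IMPALA_WORKLOAD_DIR}/custom-workload -> /opt/aux/workloads/custom-workload
# so the test framework picks it up like any in-repo workload.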