impala/tests/util/filesystem_utils.py
Joe McDonnell 6b09612e76 IMPALA-8344: Add support for running the minicluster with S3Guard
Some tests can fail on S3 because certain S3 operations are only
eventually consistent. S3Guard stores extra metadata in a DynamoDB
table to resolve several of these consistency issues.

This adds support for running the minicluster on S3 with S3Guard.
S3Guard is configured by the following environment variables:
S3GUARD_ENABLED: defaults to false, set to true to enable S3Guard
S3GUARD_DYNAMODB_TABLE: name of the DynamoDB table to use. This must
  be exclusively owned by this minicluster. The dataload scripts
  initialize this table and will purge entries if the table already
  exists. The table should be in the same region as the S3_BUCKET
  for the minicluster.
S3GUARD_DYNAMODB_REGION: AWS region for S3GUARD_DYNAMODB_TABLE
These environment variables only impact S3 configurations.
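
As an illustration, these variables would typically feed the stock
Hadoop S3Guard properties in core-site.xml (fs.s3a.metadatastore.impl,
fs.s3a.s3guard.ddb.table, fs.s3a.s3guard.ddb.region). A minimal sketch
of that mapping, where everything beyond those property names is
hypothetical glue rather than code from this change:

    import os

    def s3guard_core_site_properties():
      # Only emit S3Guard settings when explicitly enabled.
      if os.getenv("S3GUARD_ENABLED") != "true":
        return {}
      return {
        "fs.s3a.metadatastore.impl":
            "org.apache.hadoop.fs.s3a.s3guard.DynamoDBMetadataStore",
        "fs.s3a.s3guard.ddb.table": os.environ["S3GUARD_DYNAMODB_TABLE"],
        "fs.s3a.s3guard.ddb.region": os.environ["S3GUARD_DYNAMODB_REGION"],
      }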

The support comes in three pieces:
1. Configuration changes in core-site.xml to add the appropriate
   parameters.
2. Updating dataload to initialize/purge the S3Guard DynamoDB table
   and import data appropriately.
3. Updating tests to manipulate files through the HDFS command line
   rather than through S3 utilities (a rough sketch follows this
   list). This takes the filesystem utility code for ABFS (which
   already calls the HDFS command line), makes it generic, and uses
   it for S3Guard.
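
A sketch of what such a generic helper can look like; the helper name
is hypothetical, while 'hdfs dfs -rm' is the real CLI, which routes
the operation through the S3A connector (and therefore S3Guard)
instead of raw S3 APIs:

    import subprocess

    def hdfs_delete(path, recursive=True):
      # Delete 'path' via the HDFS command line rather than S3 utilities.
      cmd = ["hdfs", "dfs", "-rm"]
      if recursive:
        cmd.append("-r")
      cmd += ["-skipTrash", path]
      subprocess.check_call(cmd)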

Testing:
 - Ran multiple rounds of S3 tests
 - Aborted tests in the middle and restarted the S3 tests (to test
   the S3Guard reinitialization code)

Change-Id: I3c748529a494bb6e70fec96dc031523ff79bf61d
Reviewed-on: http://gerrit.cloudera.org:8080/13020
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Reviewed-by: Sahil Takiar <stakiar@cloudera.com>
2019-05-23 18:25:46 +00:00


# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Utilities for supporting different filesystems.
import os
# FILESYSTEM_PREFIX is the path prefix that should be used in queries. When running
# the tests against the default filesystem (fs.defaultFS), FILESYSTEM_PREFIX is the
# empty string. When running against a secondary filesystem, it will be the scheme
# and authority portion of the qualified path.
FILESYSTEM_PREFIX = os.getenv("FILESYSTEM_PREFIX", "")
SECONDARY_FILESYSTEM = os.getenv("SECONDARY_FILESYSTEM", "")
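# For example (hypothetical value): when running against S3, FILESYSTEM_PREFIX
# might be "s3a://my-test-bucket", while SECONDARY_FILESYSTEM stays empty.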
FILESYSTEM = os.getenv("TARGET_FILESYSTEM")
IS_S3 = FILESYSTEM == "s3"
IS_ISILON = FILESYSTEM == "isilon"
IS_LOCAL = FILESYSTEM == "local"
IS_HDFS = FILESYSTEM == "hdfs"
IS_ADLS = FILESYSTEM == "adls"
IS_ABFS = FILESYSTEM == "abfs"
IS_EC = os.getenv("ERASURE_CODING") == "true"
# This condition is true in both states where one can assume the default fs:
# - The environment variable is set to an empty string.
# - The environment variable is unset (None).
# When the local filesystem is used, it should always be the default filesystem.
IS_DEFAULT_FS = not FILESYSTEM_PREFIX or IS_LOCAL
# Isilon specific values.
ISILON_NAMENODE = os.getenv("ISILON_NAMENODE", "")
ISILON_WEBHDFS_PORT = 8082
# S3 specific values
S3_BUCKET_NAME = os.getenv("S3_BUCKET")
S3GUARD_ENABLED = os.getenv("S3GUARD_ENABLED") == "true"
# ADLS / ABFS specific values
ABFS_ACCOUNT_NAME = os.getenv("azure_storage_account_name")
ABFS_CONTAINER_NAME = os.getenv("azure_storage_container_name")
ADLS_STORE_NAME = os.getenv("azure_data_lake_store_name")
ADLS_CLIENT_ID = os.getenv("azure_client_id")
ADLS_TENANT_ID = os.getenv("azure_tenant_id")
ADLS_CLIENT_SECRET = os.getenv("azure_client_secret")

# A map of FILESYSTEM values to their corresponding Scan Node types
fs_to_name = {'s3': 'S3', 'hdfs': 'HDFS', 'local': 'LOCAL', 'adls': 'ADLS',
              'abfs': 'ADLS'}


def get_fs_name(fs):
  """Given the target filesystem, return the name of the associated storage layer"""
  return fs_to_name[fs]


def prepend_with_fs(fs, path):
  """Prepend 'path' with 'fs' if it's not already the prefix."""
  return path if path.startswith(fs) else "%s%s" % (fs, path)
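# Example (hypothetical values): prepend_with_fs("s3a://bucket", "/tmp/t1")
# returns "s3a://bucket/tmp/t1"; a path that already starts with the prefix
# is returned unchanged.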


def get_fs_path(path):
  return prepend_with_fs(FILESYSTEM_PREFIX, path)


def get_secondary_fs_path(path):
  return prepend_with_fs(SECONDARY_FILESYSTEM, path)


WAREHOUSE = get_fs_path('/test-warehouse')
FILESYSTEM_NAME = get_fs_name(FILESYSTEM)