mirror of
https://github.com/apache/impala.git
synced 2025-12-22 11:28:09 -05:00
This patch updates Impala explain plans so that the Scan Node section clearly
displays which filesystems the Scan Node is reading data from (support
has been added for scans from HDFS, S3, ADLS, and the local filesystem).
Before this patch, if an Impala query scanned a table with partitions
across different storage layers, the explain plan would look like this:
PLAN-ROOT SINK
|
01:EXCHANGE [UNPARTITIONED]
|
00:SCAN HDFS [functional.alltypes]
partitions=24/24 files=24 size=478.45KB
Now the explain plan will look like this:
PLAN-ROOT SINK
|
01:EXCHANGE [UNPARTITIONED]
|
00:SCAN S3 [functional.alltypes]
ADLS partitions=4/24 files=4 size=478.45KB
HDFS partitions=10/24 files=10 size=478.45KB
S3 partitions=10/24 files=10 size=478.45KB
The explain plan differentiates "SCAN HDFS" vs "SCAN S3" by using the
root table path. This means that even scans of non-partitioned tables
will see their explain plans change from "SCAN HDFS" to "SCAN
[storage-layer-name]". This change affects explain plans that are stored on
a single storage layer as well: 'partitions=...' will become
'HDFS partitions=...'.
This patch makes several changes to PlannerTest.java so that by default
test files do not validate the value of the storage layer displayed in
the explain plan. This is necessary to support classes such as
S3PlannerTest which run test files against S3. It makes several changes
to impala_test_suite.py as well in order to support validation of
explain plans in test files that run via Python. Specifically, it adds
support for a new substitution variable in test files called
$FILESYSTEM_NAME which is the name of the storage layer the test is
being run against.
Testing:
* Ran core tests
* Added new tests to PlannerTest
* Added ExplainTest to allow for more fine-grained testing of explain
plan logic
Change-Id: I4b1b4a1bc1a24e9614e3b4dc5a61dc96d075d1c3
Reviewed-on: http://gerrit.cloudera.org:8080/12282
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
81 lines
3.0 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Utilities for supporting different filesystems.
import os

# FILESYSTEM_PREFIX is the path prefix that should be used in queries. When running
# the tests against the default filesystem (fs.defaultFS), FILESYSTEM_PREFIX is the
# empty string. When running against a secondary filesystem, it will be the scheme
# and authority portion of the qualified path.
FILESYSTEM_PREFIX = os.getenv("FILESYSTEM_PREFIX", "")
SECONDARY_FILESYSTEM = os.getenv("SECONDARY_FILESYSTEM", "")
# Name of the storage layer the tests are targeting, e.g. "s3", "hdfs";
# None when TARGET_FILESYSTEM is unset.
FILESYSTEM = os.getenv("TARGET_FILESYSTEM")

# Convenience flags identifying which storage layer the tests run against.
IS_S3 = FILESYSTEM == "s3"
IS_ISILON = FILESYSTEM == "isilon"
IS_LOCAL = FILESYSTEM == "local"
IS_HDFS = FILESYSTEM == "hdfs"
IS_ADLS = FILESYSTEM == "adls"
IS_ABFS = FILESYSTEM == "abfs"
IS_EC = os.getenv("ERASURE_CODING") == "true"

# This condition is satisfied in both states where one can assume a default fs:
# - The FILESYSTEM_PREFIX environment variable is set to an empty string.
# - The FILESYSTEM_PREFIX environment variable is unset (None).
# When the local filesystem is used, it should always be the default filesystem.
IS_DEFAULT_FS = not FILESYSTEM_PREFIX or IS_LOCAL
# Isilon specific values.
ISILON_NAMENODE = os.getenv("ISILON_NAMENODE", "")
ISILON_WEBHDFS_PORT = 8082

# S3 specific values. None when the S3_BUCKET env var is unset.
S3_BUCKET_NAME = os.getenv("S3_BUCKET")

# ADLS / ABFS specific values. Each is None when the corresponding env var is
# unset. NOTE(review): the lowercase env var names look intentional (presumably
# matching the names exported by the cluster setup scripts) -- confirm before
# renaming them.
ABFS_ACCOUNT_NAME = os.getenv("azure_storage_account_name")
ABFS_CONTAINER_NAME = os.getenv("azure_storage_container_name")
ADLS_STORE_NAME = os.getenv("azure_data_lake_store_name")
ADLS_CLIENT_ID = os.getenv("azure_client_id")
ADLS_TENANT_ID = os.getenv("azure_tenant_id")
ADLS_CLIENT_SECRET = os.getenv("azure_client_secret")
# A map of FILESYSTEM values to their corresponding Scan Node types.
# 'abfs' maps to 'ADLS'; 'isilon' has no entry, so looking it up raises KeyError.
fs_to_name = {
  's3': 'S3',
  'hdfs': 'HDFS',
  'local': 'LOCAL',
  'adls': 'ADLS',
  'abfs': 'ADLS',
}


def get_fs_name(fs):
  """Given the target filesystem, return the name of the associated storage layer.

  Raises KeyError if 'fs' has no entry in fs_to_name.
  """
  return fs_to_name[fs]
def prepend_with_fs(fs, path):
  """Prepend 'path' with 'fs' if it's not already the prefix."""
  if path.startswith(fs):
    return path
  return "%s%s" % (fs, path)
def get_fs_path(path):
  """Return 'path' qualified with FILESYSTEM_PREFIX, unless already prefixed."""
  return prepend_with_fs(FILESYSTEM_PREFIX, path)
def get_secondary_fs_path(path):
  """Return 'path' qualified with SECONDARY_FILESYSTEM, unless already prefixed."""
  return prepend_with_fs(SECONDARY_FILESYSTEM, path)
# Fully qualified path of the test warehouse directory on the target filesystem.
WAREHOUSE = get_fs_path('/test-warehouse')
# Display name of the target storage layer (e.g. 'S3', 'HDFS') as used in
# explain-plan validation. NOTE(review): this raises KeyError at import time
# when TARGET_FILESYSTEM is unset or has no fs_to_name entry (e.g. 'isilon')
# -- confirm that is intended.
FILESYSTEM_NAME = get_fs_name(FILESYSTEM)