mirror of
https://github.com/apache/impala.git
synced 2026-01-06 06:01:03 -05:00
PARQUET_FILE_SIZE query option doesn't work with ADLS because the AdlFileSystem doesn't have a notion of block sizes. And impala depends on the filesystem remembering the block size which is then used as the target parquet file size (this is done for Hdfs so that the parquet file size and block size match even if the parquet_file_size isn't a valid blocksize). We special case for Adls just like we do for S3 to bypass the FileSystem block size, and instead just use the requested PARQUET_FILE_SIZE as the output partitions block_size (and consequently the parquet file target size). Testing: Re-enabled test_insert_parquet_verify_size() for ADLS. Also fixed a miscellaneous bug with the ADLS client listing helper function. Change-Id: I474a913b0ff9b2709f397702b58cb1c74251c25b Reviewed-on: http://gerrit.cloudera.org:8080/7018 Reviewed-by: Sailesh Mukil <sailesh@cloudera.com> Tested-by: Impala Public Jenkins
78 lines
2.9 KiB
Python
78 lines
2.9 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
#
|
|
# ADLS access utilities
|
|
#
|
|
# This file uses the azure-data-lake-store-python client and provides simple
|
|
# functions to the Impala test suite to access Azure Data Lake Store.
|
|
|
|
from azure.datalake.store import core, lib, multithread, exceptions
|
|
from tests.util.filesystem_base import BaseFilesystem
|
|
from tests.util.filesystem_utils import ADLS_CLIENT_ID, ADLS_TENANT_ID, ADLS_CLIENT_SECRET
|
|
|
|
class ADLSClient(BaseFilesystem):
  """Wrapper around the azure-datalake-store Python client that exposes the
  BaseFilesystem interface so the Impala test suite can access Azure Data Lake
  Store."""

  def __init__(self, store):
    """Authenticates with the configured service principal and opens a filesystem
    handle on the ADLS account named 'store'."""
    self.token = lib.auth(tenant_id=ADLS_TENANT_ID,
                          client_secret=ADLS_CLIENT_SECRET,
                          client_id=ADLS_CLIENT_ID)
    self.adlsclient = core.AzureDLFileSystem(self.token, store_name=store)

  def create_file(self, path, file_data, overwrite=True):
    """Writes 'file_data' to a new file at 'path'. Returns False without writing if
    the file already exists and 'overwrite' is False; returns True on success."""
    if not overwrite and self.exists(path): return False
    with self.adlsclient.open(path, 'wb') as f:
      num_bytes = f.write(file_data)
      # The client returns the number of bytes written; anything short of the full
      # payload indicates a failed upload.
      assert num_bytes == len(file_data), "ADLS write failed."
    return True

  def make_dir(self, path, permission=None):
    """Creates a directory at 'path'. 'permission' is accepted for interface
    compatibility with BaseFilesystem but is ignored by the ADLS client.
    Always returns True."""
    self.adlsclient.mkdir(path)
    return True

  def copy(self, src, dst):
    """Copies the file at 'src' to 'dst' within the store."""
    # The ADLS Python client doesn't support cp() yet, so we have to download and
    # reupload to the destination.
    src_contents = self.adlsclient.cat(src)
    self.create_file(dst, src_contents, overwrite=True)
    assert self.exists(dst), \
        'ADLS copy failed: Destination file {dst} does not exist'.format(dst=dst)

  def ls(self, path):
    """Returns the base names of all entries under 'path'. The client returns full
    paths, so strip everything up to the final '/'; empty names (e.g. from paths
    ending in '/') are skipped."""
    files = []
    for f in self.adlsclient.ls(path):
      fname = f.split("/")[-1]
      if fname:
        files.append(fname)
    return files

  def exists(self, path):
    """Returns True if a file or directory exists at 'path', False otherwise."""
    return self.adlsclient.exists(path)

  def delete_file_dir(self, path, recursive=False):
    """Deletes the file or directory at 'path' (recursively if 'recursive' is True).
    Returns False if 'path' does not exist, True otherwise."""
    try:
      self.adlsclient.rm(path, recursive)
    except exceptions.FileNotFoundError:
      return False
    return True

  def get_all_file_sizes(self, path):
    """Returns a list of integers which are all the file sizes of files found under
    'path'. Directories are skipped."""
    sizes = []
    for f in self.adlsclient.ls(path):
      # Fetch each entry's metadata once; the original issued two info() calls per
      # entry (one for the type check, one for the length).
      info = self.adlsclient.info(f)
      if info['type'] == 'FILE':
        sizes.append(info['length'])
    return sizes
|