mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
Reverts support for o3fs as a default filesystem added in IMPALA-9442. Updates test setup to use ofs instead. Munges absolute paths in Iceberg metadata to match the new location required for ofs. Ozone has strict requirements on volume and bucket names, so all tables must be created within a bucket (e.g. inside /impala/test-warehouse/). Change-Id: I45e90d30b2e68876dec0db3c43ac15ee510b17bd Reviewed-on: http://gerrit.cloudera.org:8080/19001 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
178 lines
7.1 KiB
Python
178 lines
7.1 KiB
Python
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
# This module contains utility functions for testing Parquet files,
|
|
# and other functions used for checking for strings in files and
|
|
# directories.
|
|
|
|
import os
|
|
import re
|
|
import tempfile
|
|
from subprocess import check_call
|
|
|
|
from tests.util.filesystem_utils import get_fs_path, WAREHOUSE_PREFIX
|
|
|
|
|
|
def create_iceberg_table_from_directory(impala_client, unique_database, table_name,
|
|
file_format):
|
|
"""Utility function to create an iceberg table from a directory. The directory must
|
|
exist in $IMPALA_HOME/testdata/data/iceberg_test with the name 'table_name'"""
|
|
|
|
# Only orc and parquet tested/supported
|
|
assert file_format == "orc" or file_format == "parquet"
|
|
|
|
local_dir = os.path.join(
|
|
os.environ['IMPALA_HOME'], 'testdata/data/iceberg_test/{0}'.format(table_name))
|
|
assert os.path.isdir(local_dir)
|
|
|
|
# If using a prefix, rewrite iceberg metadata to use the prefix
|
|
if WAREHOUSE_PREFIX:
|
|
tmp_dir = tempfile.mktemp(table_name)
|
|
check_call(['cp', '-r', local_dir, tmp_dir])
|
|
rewrite = os.path.join(
|
|
os.environ['IMPALA_HOME'], 'testdata/bin/rewrite-iceberg-metadata.py')
|
|
check_call([rewrite, WAREHOUSE_PREFIX, os.path.join(tmp_dir, 'metadata')])
|
|
local_dir = tmp_dir
|
|
|
|
# Put the directory in the database's directory (not the table directory)
|
|
hdfs_parent_dir = get_fs_path("/test-warehouse")
|
|
|
|
hdfs_dir = os.path.join(hdfs_parent_dir, table_name)
|
|
|
|
# Purge existing files if any
|
|
check_call(['hdfs', 'dfs', '-rm', '-f', '-r', hdfs_dir])
|
|
|
|
# Note: -d skips a staging copy
|
|
check_call(['hdfs', 'dfs', '-put', '-d', local_dir, hdfs_dir])
|
|
|
|
# Create external table
|
|
qualified_table_name = '{0}.{1}'.format(unique_database, table_name)
|
|
impala_client.execute("""create external table {0} stored as iceberg location '{1}'
|
|
tblproperties('write.format.default'='{2}', 'iceberg.catalog'=
|
|
'hadoop.tables')""".format(qualified_table_name, hdfs_dir,
|
|
file_format))
|
|
|
|
# Automatic clean up after drop table
|
|
impala_client.execute("""alter table {0} set tblproperties ('external.table.purge'=
|
|
'True');""".format(qualified_table_name))
|
|
|
|
def create_table_from_parquet(impala_client, unique_database, table_name):
|
|
"""Utility function to create a database table from a Parquet file. A Parquet file must
|
|
exist in $IMPALA_HOME/testdata/data with the name 'table_name'.parquet"""
|
|
create_table_from_file(impala_client, unique_database, table_name, 'parquet')
|
|
|
|
|
|
def create_table_from_orc(impala_client, unique_database, table_name):
|
|
"""Utility function to create a database table from a Orc file. A Orc file must
|
|
exist in $IMPALA_HOME/testdata/data with the name 'table_name'.orc"""
|
|
create_table_from_file(impala_client, unique_database, table_name, 'orc')
|
|
|
|
|
|
def create_table_from_file(impala_client, unique_database, table_name, file_format):
|
|
filename = '{0}.{1}'.format(table_name, file_format)
|
|
local_file = os.path.join(os.environ['IMPALA_HOME'],
|
|
'testdata/data/{0}'.format(filename))
|
|
assert os.path.isfile(local_file)
|
|
|
|
# Put the file in the database's directory (not the table directory) so it is
|
|
# available for a LOAD DATA statement.
|
|
hdfs_file = get_fs_path(
|
|
os.path.join("/test-warehouse", "{0}.db".format(unique_database), filename))
|
|
# Note: -d skips a staging copy
|
|
check_call(['hdfs', 'dfs', '-put', '-f', '-d', local_file, hdfs_file])
|
|
|
|
# Create the table and load the file
|
|
qualified_table_name = '{0}.{1}'.format(unique_database, table_name)
|
|
impala_client.execute('create table {0} like {1} "{2}" stored as {1}'.format(
|
|
qualified_table_name, file_format, hdfs_file))
|
|
impala_client.execute('load data inpath "{0}" into table {1}'.format(
|
|
hdfs_file, qualified_table_name))
|
|
|
|
|
|
def create_table_and_copy_files(impala_client, create_stmt, unique_database, table_name,
|
|
files):
|
|
# Create the table
|
|
create_stmt = create_stmt.format(db=unique_database, tbl=table_name)
|
|
impala_client.execute(create_stmt)
|
|
|
|
hdfs_dir = get_fs_path(
|
|
os.path.join("/test-warehouse", unique_database + ".db", table_name))
|
|
copy_files_to_hdfs_dir(files, hdfs_dir)
|
|
|
|
# Refresh the table metadata to see the new files
|
|
refresh_stmt = "refresh {0}.{1}".format(unique_database, table_name)
|
|
impala_client.execute(refresh_stmt)
|
|
|
|
|
|
def copy_files_to_hdfs_dir(files, hdfs_dir):
|
|
# Copy the files
|
|
# - build a list of source files
|
|
# - issue a single put to the hdfs_dir ( -d skips a staging copy)
|
|
source_files = []
|
|
for local_file in files:
|
|
# Cut off leading '/' to make os.path.join() happy
|
|
local_file = local_file if local_file[0] != '/' else local_file[1:]
|
|
local_file = os.path.join(os.environ['IMPALA_HOME'], local_file)
|
|
assert os.path.isfile(local_file)
|
|
source_files.append(local_file)
|
|
check_call(['hdfs', 'dfs', '-put', '-f', '-d'] + source_files + [hdfs_dir])
|
|
|
|
|
|
def grep_dir(dir, search, filename_search=""):
|
|
'''Recursively search for files that contain 'search' and have a filename that matches
|
|
'filename_search' and return a list of matched lines grouped by file.
|
|
'''
|
|
filename_matcher = re.compile(filename_search)
|
|
matching_files = dict()
|
|
for dir_name, _, file_names in os.walk(dir):
|
|
for file_name in file_names:
|
|
file_path = os.path.join(dir_name, file_name)
|
|
if os.path.islink(file_path) or not filename_matcher.search(file_path):
|
|
continue
|
|
with open(file_path) as file:
|
|
matching_lines = grep_file(file, search)
|
|
if matching_lines:
|
|
matching_files[file_name] = matching_lines
|
|
return matching_files
|
|
|
|
|
|
def grep_file(file, search):
|
|
'''Return lines in 'file' that match the 'search' regex. 'file' must already be
|
|
opened.
|
|
'''
|
|
matcher = re.compile(search)
|
|
matching_lines = list()
|
|
for line in file:
|
|
if matcher.search(line):
|
|
matching_lines.append(line)
|
|
return matching_lines
|
|
|
|
|
|
def assert_file_in_dir_contains(dir, search):
|
|
'''Asserts that at least one file in the 'dir' contains the 'search' term.'''
|
|
results = grep_dir(dir, search)
|
|
assert results, "%s should have a file containing '%s' but no file was found" \
|
|
% (dir, search)
|
|
|
|
|
|
def assert_no_files_in_dir_contain(dir, search):
|
|
'''Asserts that no files in the 'dir' contains the 'search' term.'''
|
|
results = grep_dir(dir, search)
|
|
assert not results, \
|
|
"%s should not have any file containing '%s' but a file was found" \
|
|
% (dir, search)
|