impala/tests/common/file_utils.py
Joe McDonnell f1a3c47959 IMPALA-7804: Mitigate s3 consistency issues for test_scanners
test_scanners.py has seen several flaky failures on
S3 due to eventual consistency. The symptom is Impala
being unable to read a file that it just loaded into S3.

Many of the tables used in test_scanners.py are
created via the file_utils helper functions,
which follow this pattern:
1. Copy files to temporary directory in HDFS/S3/etc
2. Create table
3. Run LOAD DATA to move the files to the table

In step #3, LOAD DATA loads the table's metadata
before it runs the move statement on the files.
Subsequent queries on the table therefore do not
need to reload metadata and can read the files very
soon after the move, while S3 may still be
inconsistent.

This changes the ordering to put the files in place
before loading metadata. This may improve the
likelihood that the filesystem is consistent by
the time we read it. Specifically, we now:
1. Put the files in the directory that the table
   will use when it is created.
2. Create the table.
Neither of these steps loads metadata, so the next
query that runs will load it (see the sketch below).
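
A rough sketch of the new flow (hypothetical database, table, and
file names; 'client' stands in for an Impala test client):

  # Steps #1 and #2 happen inside the helper: the file is put directly
  # into the table's final directory, then the table is created.
  create_table_and_copy_files(client,
      'create table {db}.{tbl} (i int) stored as parquet',
      'some_db', 'some_tbl', ['testdata/data/some_file.parquet'])
  # No metadata has been loaded yet; the first query triggers the load
  # and reads the file from its final location.
  client.execute('select count(*) from some_db.some_tbl')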

Change-Id: Id042496beabe0d0226b347e0653b820fee369f4e
Reviewed-on: http://gerrit.cloudera.org:8080/11959
Reviewed-by: Joe McDonnell <joemcdonnell@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2018-11-30 01:58:52 +00:00

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# This module contains utility functions for testing Parquet files,
# and other functions used for checking for strings in files and
# directories.
import os
from subprocess import check_call

from tests.util.filesystem_utils import get_fs_path


def create_table_from_parquet(impala_client, unique_database, table_name):
  """Utility function to create a database table from a Parquet file. A Parquet file must
  exist in $IMPALA_HOME/testdata/data with the name 'table_name'.parquet"""
  filename = '{0}.parquet'.format(table_name)
  local_file = os.path.join(os.environ['IMPALA_HOME'],
                            'testdata/data/{0}'.format(filename))
  assert os.path.isfile(local_file)
  # The table doesn't exist yet, so create the table's directory.
  tbl_dir = get_fs_path('/test-warehouse/{0}.db/{1}'.format(unique_database, table_name))
  check_call(['hdfs', 'dfs', '-mkdir', '-p', tbl_dir])
  # Put the Parquet file in the table's directory.
  # Note: -d skips a staging copy.
  check_call(['hdfs', 'dfs', '-put', '-f', '-d', local_file, tbl_dir])
  # Create the table.
  hdfs_file = '{0}/{1}'.format(tbl_dir, filename)
  qualified_table_name = '{0}.{1}'.format(unique_database, table_name)
  impala_client.execute('create table {0} like parquet "{1}" stored as parquet'.format(
      qualified_table_name, hdfs_file))
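
# A minimal usage sketch (hypothetical names; assumes a pytest-style Impala test
# with an Impala client at 'self.client', a 'unique_database' fixture, and a
# file testdata/data/example.parquet under $IMPALA_HOME):
#
#   def test_example(self, unique_database):
#     create_table_from_parquet(self.client, unique_database, 'example')
#     self.client.execute('select count(*) from {0}.example'.format(unique_database))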


def create_table_and_copy_files(impala_client, create_stmt, unique_database, table_name,
                                files):
  """Copies the given local 'files' (paths relative to $IMPALA_HOME) into the table's
  directory, then creates the table by formatting 'create_stmt' with the 'db' and 'tbl'
  placeholders. The files are put in place before the table exists, so no metadata is
  loaded until the first query runs."""
  # Create the table's directory.
  hdfs_dir = get_fs_path('/test-warehouse/{0}.db/{1}'.format(unique_database, table_name))
  check_call(['hdfs', 'dfs', '-mkdir', '-p', hdfs_dir])
  # Copy the files:
  # - build a list of source files
  # - issue a single put to hdfs_dir (-d skips a staging copy)
  source_files = []
  for local_file in files:
    # Cut off a leading '/' to make os.path.join() happy.
    local_file = local_file if local_file[0] != '/' else local_file[1:]
    local_file = os.path.join(os.environ['IMPALA_HOME'], local_file)
    assert os.path.isfile(local_file)
    source_files.append(local_file)
  check_call(['hdfs', 'dfs', '-put', '-f', '-d'] + source_files + [hdfs_dir])
  # Create the table.
  create_stmt = create_stmt.format(db=unique_database, tbl=table_name)
  impala_client.execute(create_stmt)
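
# A minimal usage sketch (hypothetical CREATE statement and file paths; the
# {db}/{tbl} placeholders are filled in by this helper, and the files are
# resolved relative to $IMPALA_HOME):
#
#   create_table_and_copy_files(
#       self.client, 'create table {db}.{tbl} (id int) stored as textfile',
#       unique_database, 'my_table',
#       ['testdata/data/file1.txt', 'testdata/data/file2.txt'])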


def grep_dir(dir, search):
  '''Recursively search 'dir' for files that contain 'search' and return a dict mapping
  each matching file name to its matching lines.
  '''
  matching_files = dict()
  for dir_name, _, file_names in os.walk(dir):
    for file_name in file_names:
      file_path = os.path.join(dir_name, file_name)
      if os.path.islink(file_path):
        continue
      with open(file_path) as file:
        matching_lines = grep_file(file, search)
        if matching_lines:
          matching_files[file_name] = matching_lines
  return matching_files


def grep_file(file, search):
  '''Return lines in 'file' that contain the 'search' term. 'file' must already be
  opened.
  '''
  matching_lines = list()
  for line in file:
    if search in line:
      matching_lines.append(line)
  return matching_lines


def assert_file_in_dir_contains(dir, search):
  '''Asserts that at least one file in 'dir' contains the 'search' term.'''
  results = grep_dir(dir, search)
  assert results, "%s should have a file containing '%s' but no file was found" \
      % (dir, search)


def assert_no_files_in_dir_contain(dir, search):
  '''Asserts that no file in 'dir' contains the 'search' term.'''
  results = grep_dir(dir, search)
  assert not results, \
      "%s should not have any file containing '%s' but a file was found" \
      % (dir, search)
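
# A minimal usage sketch for the grep/assert helpers (hypothetical directory
# and search strings):
#
#   log_dir = os.path.join(os.environ['IMPALA_HOME'], 'logs/cluster')
#   matches = grep_dir(log_dir, 'ERROR')  # dict: file name -> matching lines
#   assert_file_in_dir_contains(log_dir, 'Impala has started')
#   assert_no_files_in_dir_contain(log_dir, 'Check failed')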