IMPALA-1567: Ignore 'hidden' files with special suffixes.

Currently, we only consider files hidden if they have the special
prefixes "." or "_". However, some tools use special suffixes
to indicate that a file is still being operated on; such files should
also be considered hidden.

This patch adds the following hidden suffixes:
'.tmp' - Flume's default for temp files
'.copying' - hdfs put may produce these

Change-Id: I151eafd0286fa91e062407e12dd71cfddd442430
Reviewed-on: http://gerrit.cloudera.org:8080/80
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Internal Jenkins
This commit is contained in:
Alex Behm
2015-02-19 12:50:10 -08:00
committed by Internal Jenkins
parent b47d50d263
commit 37ca6b81ae
4 changed files with 118 additions and 2 deletions

View File

@@ -199,8 +199,12 @@ public class FileSystemUtil {
}
/**
 * Returns true if a file with the given name should be treated as hidden.
 *
 * A file is hidden if its name starts with '.' or '_', or if it ends with one of
 * the suffixes '.copying' or '.tmp'. The suffix match is case-insensitive.
 *
 * @param fileName the name of the file to check; must not be null
 * @return true if the name matches one of the hidden prefixes or suffixes
 */
public static boolean isHiddenFile(String fileName) {
  // Hidden files start with '.' or '_'. The '.copying' suffix is used by some
  // filesystem utilities (e.g. hdfs put) as a temporary destination when copying
  // files. The '.tmp' suffix is Flume's default for temporary files.
  //
  // Lower-case with Locale.ROOT rather than the JVM's default locale: in e.g. a
  // Turkish locale, 'I' lower-cases to a dotless 'ı', so "X.COPYING" would not
  // match the ".copying" suffix.
  String lcFileName = fileName.toLowerCase(java.util.Locale.ROOT);
  return lcFileName.startsWith(".") || lcFileName.startsWith("_") ||
      lcFileName.endsWith(".copying") || lcFileName.endsWith(".tmp");
}
/**

View File

@@ -0,0 +1,19 @@
====
---- QUERY
show partitions hidden_files_db.hf
---- LABELS
YEAR, MONTH, #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS
---- RESULTS
'2010','1',-1,1,'20.36KB','NOT CACHED','NOT CACHED','TEXT','false'
'2010','2',-1,0,'0B','NOT CACHED','NOT CACHED','TEXT','false'
'Total','',-1,1,'20.36KB','0B','','',''
---- TYPES
STRING, STRING, BIGINT, BIGINT, STRING, STRING, STRING, STRING, STRING
====
---- QUERY
select count(*) from hidden_files_db.hf
---- RESULTS
310
---- TYPES
BIGINT
====

View File

@@ -94,6 +94,8 @@ class TestMatrix(object):
return self.dimensions.has_key(dimension_name)
def generate_test_vectors(self, exploration_strategy):
if not self.dimensions:
return list()
# TODO: Check valid exploration strategies, provide more options for exploration
if exploration_strategy == 'exhaustive':
return self.__generate_exhaustive_combinations()

View File

@@ -0,0 +1,91 @@
# Copyright (c) 2015 Cloudera, Inc. All rights reserved.
import pytest
from tests.common.test_vector import *
from tests.common.impala_test_suite import *
from subprocess import check_call
# Database that is (re)created in setup_method() and cleaned up in teardown_method().
TEST_DB = 'hidden_files_db'
# Table inside TEST_DB whose partition directories receive the visible and hidden files.
TEST_TBL = 'hf'
class TestHiddenFiles(ImpalaTestSuite):
    """
    Tests that files with special prefixes/suffixes are considered 'hidden' when
    loading table metadata and running queries.

    Each test method runs against a freshly created table with two partitions
    (year=2010, month=1 and month=2). The first partition holds one visible file
    plus hidden files; the second holds only hidden files.
    """

    @classmethod
    def get_workload(cls):
        """Returns the name of the workload this suite belongs to."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Restricts the test matrix to a single exec option and uncompressed text."""
        super(TestHiddenFiles, cls).add_test_dimensions()
        cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
        cls.TestMatrix.add_dimension(
            create_uncompressed_text_dimension(cls.get_workload()))
        # NOTE: This test takes a long time and was meant to run only in exhaustive
        # mode, but that restriction is currently disabled:
        # if cls.exploration_strategy() != 'exhaustive':
        #     cls.TestMatrix.clear()

    def setup_method(self, method):
        """Creates the test database and table, adds both partitions and fills them."""
        self.cleanup_db(TEST_DB)
        self.client.execute("create database " + TEST_DB)
        self.client.execute(
            "create table %s.%s like functional.alltypes" % (TEST_DB, TEST_TBL))
        for month in (1, 2):
            self.client.execute(
                "alter table %s.%s add partition (year=2010, month=%d)"
                % (TEST_DB, TEST_TBL, month))
        self.__populate_test_table()

    def teardown_method(self, method):
        """Cleans up the test database after each test method."""
        self.cleanup_db(TEST_DB)

    @staticmethod
    def __hdfs_copy(src, dst):
        """Copies 'src' to 'dst' using 'hadoop fs -cp'.

        check_call() raises CalledProcessError if the command exits non-zero."""
        check_call(["hadoop", "fs", "-cp", src, dst], shell=False)

    def __populate_test_table(self):
        """Copy files into the HDFS directories of two partitions of the table.
        The goal is to have both an empty and non-empty partition with hidden files."""
        src_root = "/test-warehouse/alltypes"
        tbl_loc = "/test-warehouse/%s.db/%s" % (TEST_DB, TEST_TBL)
        # Each entry: (partition directory, source file name, destination file names).
        copies = [
            # Non-empty partition: one visible file, then hidden files that use
            # upper case hidden suffixes.
            ("year=2010/month=1", "100101.txt",
             ["100101.txt", ".100101.txt", "_100101.txt",
              "100101.txt.COPYING", "100101.txt.TMP"]),
            # Empty partition: only hidden files, using lower case hidden suffixes.
            ("year=2010/month=2", "100201.txt",
             [".100201.txt", "_100201.txt",
              "100201.txt.copying", "100201.txt.tmp"]),
        ]
        for partition, src_name, dst_names in copies:
            src = "%s/%s/%s" % (src_root, partition, src_name)
            for dst_name in dst_names:
                self.__hdfs_copy(src, "%s/%s/%s" % (tbl_loc, partition, dst_name))

    @pytest.mark.execute_serially
    def test_hidden_files_load(self, vector):
        """Tests that loading metadata after 'invalidate metadata' ignores hidden
        files."""
        self.client.execute("invalidate metadata %s.%s" % (TEST_DB, TEST_TBL))
        self.run_test_case('QueryTest/hidden-files', vector)

    @pytest.mark.execute_serially
    def test_hidden_files_refresh(self, vector):
        """Tests that an incremental refresh ignores hidden files."""
        self.client.execute("refresh %s.%s" % (TEST_DB, TEST_TBL))
        self.run_test_case('QueryTest/hidden-files', vector)