mirror of
https://github.com/apache/impala.git
synced 2025-12-25 02:03:09 -05:00
IMPALA-1567: Ignore 'hidden' files with special suffixes.
Currently, we only consider files hidden if they have the special prefixes "." or "_". However, some tools use special suffixes to indicate that a file is being operated on and should be considered invisible.
This patch adds the following hidden suffixes:
- '.tmp': Flume's default for temp files
- '.copying': hdfs put may produce these
Change-Id: I151eafd0286fa91e062407e12dd71cfddd442430
Reviewed-on: http://gerrit.cloudera.org:8080/80
Reviewed-by: Alex Behm <alex.behm@cloudera.com>
Tested-by: Internal Jenkins
This commit is contained in:
committed by
Internal Jenkins
parent
b47d50d263
commit
37ca6b81ae
@@ -199,8 +199,12 @@ public class FileSystemUtil {
|
||||
}
|
||||
|
||||
public static boolean isHiddenFile(String fileName) {
|
||||
// Hidden files start with . or _
|
||||
return fileName.startsWith(".") || fileName.startsWith("_");
|
||||
// Hidden files start with '.' or '_'. The '.copying' suffix is used by some
|
||||
// filesystem utilities (e.g. hdfs put) as a temporary destination when copying
|
||||
// files. The '.tmp' suffix is Flume's default for temporary files.
|
||||
String lcFileName = fileName.toLowerCase();
|
||||
return lcFileName.startsWith(".") || lcFileName.startsWith("_") ||
|
||||
lcFileName.endsWith(".copying") || lcFileName.endsWith(".tmp");
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
19
testdata/workloads/functional-query/queries/QueryTest/hidden-files.test
vendored
Normal file
19
testdata/workloads/functional-query/queries/QueryTest/hidden-files.test
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
====
|
||||
---- QUERY
|
||||
show partitions hidden_files_db.hf
|
||||
---- LABELS
|
||||
YEAR, MONTH, #ROWS, #FILES, SIZE, BYTES CACHED, CACHE REPLICATION, FORMAT, INCREMENTAL STATS
|
||||
---- RESULTS
|
||||
'2010','1',-1,1,'20.36KB','NOT CACHED','NOT CACHED','TEXT','false'
|
||||
'2010','2',-1,0,'0B','NOT CACHED','NOT CACHED','TEXT','false'
|
||||
'Total','',-1,1,'20.36KB','0B','','',''
|
||||
---- TYPES
|
||||
STRING, STRING, BIGINT, BIGINT, STRING, STRING, STRING, STRING, STRING
|
||||
====
|
||||
---- QUERY
|
||||
select count(*) from hidden_files_db.hf
|
||||
---- RESULTS
|
||||
310
|
||||
---- TYPES
|
||||
BIGINT
|
||||
====
|
||||
@@ -94,6 +94,8 @@ class TestMatrix(object):
|
||||
return self.dimensions.has_key(dimension_name)
|
||||
|
||||
def generate_test_vectors(self, exploration_strategy):
|
||||
if not self.dimensions:
|
||||
return list()
|
||||
# TODO: Check valid exploration strategies, provide more options for exploration
|
||||
if exploration_strategy == 'exhaustive':
|
||||
return self.__generate_exhaustive_combinations()
|
||||
|
||||
91
tests/metadata/test_hidden_files.py
Normal file
91
tests/metadata/test_hidden_files.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# Copyright (c) 2015 Cloudera, Inc. All rights reserved.
|
||||
|
||||
import pytest
|
||||
from tests.common.test_vector import *
|
||||
from tests.common.impala_test_suite import *
|
||||
from subprocess import check_call
|
||||
|
||||
# Scratch database and table that every test method creates and drops.
TEST_DB = 'hidden_files_db'
TEST_TBL = 'hf'


class TestHiddenFiles(ImpalaTestSuite):
  """
  Tests that files with special prefixes/suffixes are considered 'hidden' when
  loading table metadata and running queries.
  """

  @classmethod
  def get_workload(cls):
    """Returns the name of the workload this suite belongs to."""
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    """Restricts the test matrix to one exec option set and uncompressed text."""
    super(TestHiddenFiles, cls).add_test_dimensions()
    cls.TestMatrix.add_dimension(create_single_exec_option_dimension())
    cls.TestMatrix.add_dimension(create_uncompressed_text_dimension(cls.get_workload()))
    # NOTE: The restriction to exhaustive mode below is currently commented out, so
    # it has no effect on which exploration strategies run this test.
    # Only run in exhaustive mode since this test takes a long time.
    #if cls.exploration_strategy() != 'exhaustive':
    #  cls.TestMatrix.clear()

  def setup_method(self, method):
    """Recreates the test database and table with two partitions, then copies the
    visible and hidden files into them."""
    self.cleanup_db(TEST_DB)
    self.client.execute("create database " + TEST_DB)
    self.client.execute(
        "create table %s.%s like functional.alltypes" % (TEST_DB, TEST_TBL))
    self.client.execute(
        "alter table %s.%s add partition (year=2010, month=1)" % (TEST_DB, TEST_TBL))
    self.client.execute(
        "alter table %s.%s add partition (year=2010, month=2)" % (TEST_DB, TEST_TBL))
    self.__populate_test_table()

  def teardown_method(self, method):
    """Drops the test database."""
    self.cleanup_db(TEST_DB)

  def __populate_test_table(self):
    """Copy files into the HDFS directories of two partitions of the table.

    The goal is to have both an empty and non-empty partition with hidden files:
    year=2010/month=1 gets one visible file plus hidden copies of it, while
    year=2010/month=2 gets hidden files only.
    """
    src_loc = "/test-warehouse/alltypes"
    tbl_loc = "/test-warehouse/%s.db/%s" % (TEST_DB, TEST_TBL)

    # One entry per partition: (partition dir, source file, destination file names).
    # Files are copied in exactly the order listed.
    copies = [
      ("year=2010/month=1", "100101.txt", [
        # A visible file, making this the non-empty partition.
        "100101.txt",
        # Hidden files for the non-empty partition. Use upper case hidden suffixes.
        ".100101.txt",
        "_100101.txt",
        "100101.txt.COPYING",
        "100101.txt.TMP",
      ]),
      ("year=2010/month=2", "100201.txt", [
        # Hidden files for the empty partition. Use lower case hidden suffixes.
        ".100201.txt",
        "_100201.txt",
        "100201.txt.copying",
        "100201.txt.tmp",
      ]),
    ]

    for partition, src_file, dst_files in copies:
      src_path = "%s/%s/%s" % (src_loc, partition, src_file)
      for dst_file in dst_files:
        dst_path = "%s/%s/%s" % (tbl_loc, partition, dst_file)
        check_call(["hadoop", "fs", "-cp", src_path, dst_path], shell=False)

  @pytest.mark.execute_serially
  def test_hidden_files_load(self, vector):
    """Tests that hidden files are ignored when the table is queried after an
    'invalidate metadata'."""
    self.client.execute("invalidate metadata %s.%s" % (TEST_DB, TEST_TBL))
    self.run_test_case('QueryTest/hidden-files', vector)

  @pytest.mark.execute_serially
  def test_hidden_files_refresh(self, vector):
    """Tests that hidden files are ignored when the table is queried after a
    'refresh'."""
    self.client.execute("refresh %s.%s" % (TEST_DB, TEST_TBL))
    self.run_test_case('QueryTest/hidden-files', vector)
|
||||
Reference in New Issue
Block a user