mirror of
https://github.com/apache/impala.git
synced 2025-12-19 18:12:08 -05:00
IMPALA-9923: Load ORC serially to hack around flakiness
ORC dataload has been intermittently failing with "Fail to get checksum, since file .../_orc_acid_version is under construction." This is due to some Hive/HDFS interaction that seems to get worse with parallelism. The failure has been hitting a lot of developer tests.

As a temporary workaround, this changes dataload to load ORC serially. This is slightly slower, but it should be more reliable.

Testing:
- Ran precommit tests, manually verified dataload logs

Change-Id: I15eff1ec6cab32c1216ed7400e4c4b57bb81e4cd
Reviewed-on: http://gerrit.cloudera.org:8080/16292
Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
@@ -415,6 +415,7 @@ def main():
|
||||
|
||||
impala_create_files = []
|
||||
hive_load_text_files = []
|
||||
hive_load_orc_files = []
|
||||
hive_load_nontext_files = []
|
||||
hbase_create_files = []
|
||||
hbase_postload_files = []
|
||||
@@ -426,6 +427,8 @@ def main():
|
||||
elif hive_load_match in filename:
|
||||
if 'text-none-none' in filename:
|
||||
hive_load_text_files.append(filename)
|
||||
elif 'orc-def-block' in filename:
|
||||
hive_load_orc_files.append(filename)
|
||||
else:
|
||||
hive_load_nontext_files.append(filename)
|
||||
elif hbase_create_match in filename:
|
||||
@@ -448,6 +451,7 @@ def main():
|
||||
|
||||
log_file_list("Impala Create Files:", impala_create_files)
|
||||
log_file_list("Hive Load Text Files:", hive_load_text_files)
|
||||
log_file_list("Hive Load Orc Files:", hive_load_orc_files)
|
||||
log_file_list("Hive Load Non-Text Files:", hive_load_nontext_files)
|
||||
log_file_list("HBase Create Files:", hbase_create_files)
|
||||
log_file_list("HBase Post-Load Files:", hbase_postload_files)
|
||||
@@ -472,6 +476,13 @@ def main():
|
||||
# need to be loaded first
|
||||
assert(len(hive_load_text_files) <= 1)
|
||||
hive_exec_query_files_parallel(thread_pool, hive_load_text_files)
|
||||
# IMPALA-9923: Run ORC serially separately from other non-text formats. This hacks
|
||||
# around flakiness seen when loading this in parallel. This should be removed as
|
||||
# soon as possible.
|
||||
assert(len(hive_load_orc_files) <= 1)
|
||||
hive_exec_query_files_parallel(thread_pool, hive_load_orc_files)
|
||||
|
||||
# Load all non-text formats (goes parallel)
|
||||
hive_exec_query_files_parallel(thread_pool, hive_load_nontext_files)
|
||||
|
||||
assert(len(hbase_postload_files) <= 1)
|
||||
|
||||
Reference in New Issue
Block a user