impala/tests/query_test/test_insert.py
Sahil Takiar 8b8a49e617 IMPALA-8557: Add '.txt' to text files, remove '.' at end of filenames
Writes to text tables on ABFS are failing because HADOOP-15860 recently
changed the ABFS behavior when writing files / folders that end with a
'.'. ABFS explicitly does not allow files / folders that end with a dot.
From the ABFS docs: "Avoid blob names that end with a dot (.), a forward
slash (/), or a sequence or combination of the two."

The behavior prior to HADOOP-15860 was to silently drop any trailing
dots when writing files or folders, but that can lead to subtle bugs:
a client may try to read back a file that should exist on ABFS, only
to find that it doesn't. HADOOP-15860 changed the behavior so that any
attempt to write a file or folder with a trailing dot fails on ABFS.
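
As a minimal Python sketch of that semantic change (illustrative only,
not the actual hadoop-azure code; validate_abfs_name is a hypothetical
helper):

  def validate_abfs_name(name):
    # Prior to HADOOP-15860: trailing dots were silently dropped, so
    # "data.0." was stored as "data.0" and a later read of "data.0."
    # would not find the file.
    # After HADOOP-15860: the write itself fails.
    if name.endswith('.'):
      raise ValueError("ABFS forbids names ending with '.': " + name)
    return name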

Impala writes all text files with a trailing dot due to some odd
behavior in hdfs-table-sink.cc. The table sink writes files with a
"file extension" that depends on the file type; for example, Parquet
files get the extension ".parq". Text files, however, had no file
extension, so Impala would try to write text files with names of the
following form:
"244c5ee8ece6f759-8b1a1e3b00000000_45513034_data.0.".

Several tables created during dataload, such as alltypes, already use
the '.txt' extension for their files. These tables are not created via
Impala's INSERT code path; their files are copied into the table
instead. However, several other tables created during dataload, such
as alltypesinsert, are populated via Impala. This patch changes the
files in those tables so that they end in '.txt'.

This patch adds the ".txt" extension to all written text files and
modifies hdfs-table-sink.cc so that it no longer appends a trailing
dot to a filename when there is no file extension.
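
Roughly, the naming logic becomes the following (a Python sketch under
assumed names; output_file_name, prefix, and extension are not the
actual hdfs-table-sink.cc identifiers):

  def output_file_name(prefix, file_number, extension):
    # Before: the separator dot was appended unconditionally, so an
    # empty text extension yielded "<prefix>.<n>." with a trailing dot.
    # After: the dot is only appended together with an extension.
    name = "%s.%d" % (prefix, file_number)
    if extension:
      name += "." + extension
    return name

  # output_file_name("..._data", 0, "txt") -> "..._data.0.txt"
  # output_file_name("..._data", 0, "")    -> "..._data.0"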

Testing:
* Ran core tests
* Re-ran affected ABFS tests
* Added test to validate that the correct file extension is used for
Parquet and text tables
* Manually validated that without the addition of the '.txt' file
extension, files are not written with a trailing dot

Change-Id: I2a9adacd45855cde86724e10f8a131e17ebf46f8
Reviewed-on: http://gerrit.cloudera.org:8080/14621
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2019-11-06 05:49:31 +00:00


# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# Targeted Impala insert tests
import os
import pytest
from testdata.common import widetable
from tests.common.impala_cluster import ImpalaCluster
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.parametrize import UniqueDatabase
from tests.common.skip import SkipIfABFS, SkipIfEC, SkipIfLocal, \
SkipIfHive2, SkipIfNotHdfsMinicluster, SkipIfS3, SkipIfDockerizedCluster
from tests.common.test_dimensions import (
create_exec_option_dimension,
create_uncompressed_text_dimension)
from tests.common.test_result_verifier import (
QueryTestResult,
parse_result_rows)
from tests.common.test_vector import ImpalaTestDimension
from tests.verifiers.metric_verifier import MetricVerifier

PARQUET_CODECS = ['none', 'snappy', 'gzip', 'zstd', 'lz4']


class TestInsertQueries(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

@classmethod
def add_test_dimensions(cls):
super(TestInsertQueries, cls).add_test_dimensions()
# Fix the exec_option vector to have a single value. This is needed should we decide
# to run the insert tests in parallel (otherwise there will be two tests inserting
# into the same table at the same time for the same file format).
# TODO: When we do decide to run these tests in parallel we could create unique temp
# tables for each test case to resolve the concurrency problems.
if cls.exploration_strategy() == 'core':
cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
cluster_sizes=[0], disable_codegen_options=[True, False], batch_sizes=[0],
sync_ddl=[0]))
cls.ImpalaTestMatrix.add_dimension(
create_uncompressed_text_dimension(cls.get_workload()))
else:
cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
cluster_sizes=[0], disable_codegen_options=[True, False], batch_sizes=[0, 1, 16],
sync_ddl=[0, 1]))
      cls.ImpalaTestMatrix.add_dimension(
          ImpalaTestDimension("compression_codec", *PARQUET_CODECS))
# Insert is currently only supported for text and parquet
# For parquet, we want to iterate through all the compression codecs
# TODO: each column in parquet can have a different codec. We could
# test all the codecs in one table/file with some additional flags.
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').file_format == 'parquet' or \
(v.get_value('table_format').file_format == 'text' and \
v.get_value('compression_codec') == 'none'))
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').compression_codec == 'none')
# Only test other batch sizes for uncompressed parquet to keep the execution time
# within reasonable bounds.
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('exec_option')['batch_size'] == 0 or \
(v.get_value('table_format').file_format == 'parquet' and \
v.get_value('compression_codec') == 'none'))
@pytest.mark.execute_serially
def test_insert_large_string(self, vector, unique_database):
"""Test handling of large strings in inserter and scanner."""
if "-Xcheck:jni" in os.environ.get("LIBHDFS_OPTS", ""):
pytest.skip("Test unreasonably slow with JNI checking.")
table_name = unique_database + ".insert_largestring"
self.client.set_configuration_option("mem_limit", "4gb")
file_format = vector.get_value('table_format').file_format
if file_format == "parquet":
stored_as = file_format
else:
assert file_format == "text"
stored_as = "textfile"
self.client.execute("""
create table {0}
stored as {1} as
select repeat('AZ', 128 * 1024 * 1024) as s""".format(table_name, stored_as))
# Make sure it produces correct result when materializing no tuples.
result = self.client.execute("select count(*) from {0}".format(table_name))
assert result.data == ["1"]
# Make sure it got the length right.
result = self.client.execute("select length(s) from {0}".format(table_name))
assert result.data == [str(2 * 128 * 1024 * 1024)]
# Spot-check the data.
result = self.client.execute(
"select substr(s, 200 * 1024 * 1024, 5) from {0}".format(table_name))
assert result.data == ["ZAZAZ"]
# IMPALA-7648: test that we gracefully fail when there is not enough memory
# to fit the scanned string in memory.
self.client.set_configuration_option("mem_limit", "50M")
try:
self.client.execute("select s from {0}".format(table_name))
assert False, "Expected query to fail"
    except Exception as e:
assert "Memory limit exceeded" in str(e)
@classmethod
def setup_class(cls):
super(TestInsertQueries, cls).setup_class()
@pytest.mark.execute_serially
# Erasure coding doesn't respect memory limit
@SkipIfEC.fix_later
# ABFS partition names cannot end in periods
@SkipIfABFS.file_or_folder_name_ends_with_period
def test_insert(self, vector):
if (vector.get_value('table_format').file_format == 'parquet'):
vector.get_value('exec_option')['COMPRESSION_CODEC'] = \
vector.get_value('compression_codec')
self.run_test_case('QueryTest/insert', vector,
multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)
@SkipIfHive2.acid
@UniqueDatabase.parametrize(sync_ddl=True)
def test_acid_insert(self, vector, unique_database):
exec_options = vector.get_value('exec_option')
file_format = vector.get_value('table_format').file_format
if (file_format == 'parquet'):
exec_options['COMPRESSION_CODEC'] = vector.get_value('compression_codec')
exec_options['DEFAULT_FILE_FORMAT'] = file_format
self.run_test_case('QueryTest/acid-insert', vector, unique_database,
multiple_impalad=exec_options['sync_ddl'] == 1)
@SkipIfHive2.acid
@UniqueDatabase.parametrize(sync_ddl=True)
def test_acid_nonacid_insert(self, vector, unique_database):
self.run_test_case('QueryTest/acid-nonacid-insert', vector, unique_database,
multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)
@SkipIfHive2.acid
@UniqueDatabase.parametrize(sync_ddl=True)
def test_acid_insert_fail(self, vector, unique_database):
self.run_test_case('QueryTest/acid-insert-fail', vector, unique_database,
multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)
@pytest.mark.execute_serially
@SkipIfNotHdfsMinicluster.tuned_for_minicluster
def test_insert_mem_limit(self, vector):
if (vector.get_value('table_format').file_format == 'parquet'):
vector.get_value('exec_option')['COMPRESSION_CODEC'] = \
vector.get_value('compression_codec')
self.run_test_case('QueryTest/insert-mem-limit', vector,
multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)
# IMPALA-7023: These queries can linger and use up memory, causing subsequent
# tests to hit memory limits. Wait for some time to allow the query to
# be reclaimed.
verifiers = [MetricVerifier(i.service)
for i in ImpalaCluster.get_e2e_test_cluster().impalads]
for v in verifiers:
v.wait_for_metric("impala-server.num-fragments-in-flight", 0, timeout=180)
@pytest.mark.execute_serially
@SkipIfS3.eventually_consistent
def test_insert_overwrite(self, vector):
self.run_test_case('QueryTest/insert_overwrite', vector,
multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)
@pytest.mark.execute_serially
def test_insert_bad_expr(self, vector):
# The test currently relies on codegen being disabled to trigger an error in
# the output expression of the table sink.
if vector.get_value('exec_option')['disable_codegen']:
self.run_test_case('QueryTest/insert_bad_expr', vector,
multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)
@UniqueDatabase.parametrize(sync_ddl=True)
def test_insert_random_partition(self, vector, unique_database):
"""Regression test for IMPALA-402: partitioning by rand() leads to strange behaviour
or crashes."""
self.run_test_case('QueryTest/insert-random-partition', vector, unique_database,
multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)


class TestInsertWideTable(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

@classmethod
def add_test_dimensions(cls):
super(TestInsertWideTable, cls).add_test_dimensions()
# Only vary codegen
cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
cluster_sizes=[0], disable_codegen_options=[True, False], batch_sizes=[0]))
# Inserts only supported on text and parquet
# TODO: Enable 'text'/codec once the compressed text writers are in.
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').file_format == 'parquet' or \
v.get_value('table_format').file_format == 'text')
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').compression_codec == 'none')
# Don't run on core. This test is very slow (IMPALA-864) and we are unlikely to
# regress here.
if cls.exploration_strategy() == 'core':
      cls.ImpalaTestMatrix.add_constraint(lambda v: False)
@SkipIfLocal.parquet_file_size
def test_insert_wide_table(self, vector, unique_database):
table_format = vector.get_value('table_format')
# Text can't handle as many columns as Parquet (codegen takes forever)
num_cols = 1000 if table_format.file_format == 'text' else 2000
table_name = unique_database + ".insert_widetable"
if vector.get_value('exec_option')['disable_codegen']:
table_name += "_codegen_disabled"
col_descs = widetable.get_columns(num_cols)
create_stmt = "CREATE TABLE " + table_name + "(" + ','.join(col_descs) + ")"
if vector.get_value('table_format').file_format == 'parquet':
create_stmt += " stored as parquet"
self.client.execute(create_stmt)
# Get a single row of data
col_vals = widetable.get_data(num_cols, 1, quote_strings=True)[0]
insert_stmt = "INSERT INTO " + table_name + " VALUES(" + col_vals + ")"
self.client.execute(insert_stmt)
result = self.client.execute("select count(*) from " + table_name)
assert result.data == ["1"]
result = self.client.execute("select * from " + table_name)
types = result.column_types
labels = result.column_labels
expected = QueryTestResult([col_vals], types, labels, order_matters=False)
actual = QueryTestResult(parse_result_rows(result), types, labels, order_matters=False)
assert expected == actual


class TestInsertPartKey(ImpalaTestSuite):
  """Regression test for IMPALA-875"""

  @classmethod
  def get_workload(cls):
    return 'functional-query'

@classmethod
def add_test_dimensions(cls):
super(TestInsertPartKey, cls).add_test_dimensions()
# Only run for a single table type
cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
cluster_sizes=[0], disable_codegen_options=[False], batch_sizes=[0],
sync_ddl=[1]))
cls.ImpalaTestMatrix.add_constraint(lambda v:
(v.get_value('table_format').file_format == 'text'))
cls.ImpalaTestMatrix.add_constraint(lambda v:\
v.get_value('table_format').compression_codec == 'none')
@pytest.mark.execute_serially
def test_insert_part_key(self, vector):
"""Test that partition column exprs are cast to the correct type. See IMPALA-875."""
self.run_test_case('QueryTest/insert_part_key', vector,
multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)


class TestInsertNullQueries(ImpalaTestSuite):
  @classmethod
  def get_workload(cls):
    return 'functional-query'

@classmethod
def add_test_dimensions(cls):
super(TestInsertNullQueries, cls).add_test_dimensions()
# Fix the exec_option vector to have a single value. This is needed should we decide
# to run the insert tests in parallel (otherwise there will be two tests inserting
# into the same table at the same time for the same file format).
# TODO: When we do decide to run these tests in parallel we could create unique temp
# tables for each test case to resolve the concurrency problems.
cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
cluster_sizes=[0], disable_codegen_options=[False], batch_sizes=[0]))
# These tests only make sense for inserting into a text table with special
# logic to handle all the possible ways NULL needs to be written as ascii
cls.ImpalaTestMatrix.add_constraint(lambda v:\
(v.get_value('table_format').file_format == 'text' and \
v.get_value('table_format').compression_codec == 'none'))
@classmethod
def setup_class(cls):
super(TestInsertNullQueries, cls).setup_class()
@pytest.mark.execute_serially
def test_insert_null(self, vector):
self.run_test_case('QueryTest/insert_null', vector)


class TestInsertFileExtension(ImpalaTestSuite):
  """Tests that files written to a table have the correct file extension. Asserts that
  Parquet files end with .parq and text files end with .txt."""

  @classmethod
  def get_workload(cls):
    return 'functional-query'

@classmethod
def add_test_dimensions(cls):
cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension(
'table_format_and_file_extension',
*[('parquet', '.parq'), ('textfile', '.txt')]))
@classmethod
def setup_class(cls):
super(TestInsertFileExtension, cls).setup_class()
def test_file_extension(self, vector, unique_database):
table_format = vector.get_value('table_format_and_file_extension')[0]
file_extension = vector.get_value('table_format_and_file_extension')[1]
table_name = "{0}_table".format(table_format)
ctas_query = "create table {0}.{1} stored as {2} as select 1".format(
unique_database, table_name, table_format)
self.execute_query_expect_success(self.client, ctas_query)
for path in self.filesystem_client.ls("test-warehouse/{0}.db/{1}".format(
unique_database, table_name)):
if not path.startswith('_'): assert path.endswith(file_extension)