IMPALA-9709: Remove Impala-lzo from the development environment

This removes Impala-lzo from the Impala development environment.
Impala-lzo is no longer built as part of the Impala build, the LZO
plugin is no longer loaded, LZO tables are not loaded during
dataload, and LZO is no longer tested.

This also removes some obsolete scan APIs that were only used by Impala-lzo.
With this commit, Impala-lzo would require code changes to build
against Impala.

The plugin infrastructure is not removed, and this leaves some
LZO support code in place. If someone were to decide to revive
Impala-lzo, they would still be able to load it as a plugin
and get the same functionality as before. This plugin support
may be removed later.

Testing:
 - Dryrun of GVO
 - Modified TestPartitionMetadataUncompressedTextOnly's
   test_unsupported_text_compression() to add an LZO case

Change-Id: I3a4f12247d8872b7e14c9feb4b2c58cfd60d4c0e
Reviewed-on: http://gerrit.cloudera.org:8080/15814
Reviewed-by: Bikramjeet Vig <bikramjeet.vig@cloudera.com>
Tested-by: Joe McDonnell <joemcdonnell@cloudera.com>
Author: Joe McDonnell
Date:   2020-04-26 18:38:26 -07:00
Commit: f15a311065 (parent: 38b9617462)
65 changed files with 88 additions and 346 deletions


@@ -438,17 +438,6 @@ add_custom_target(cscope ALL DEPENDS gen-deps
COMMAND "${CMAKE_SOURCE_DIR}/bin/gen-cscope.sh"
)
# This call is passing IMPALA_TOOLCHAIN_PACKAGES_HOME into Impala-lzo's build.sh,
# but this is known not to work with the current version of Impala-lzo when
# IMPALA_TOOLCHAIN_PACKAGES_HOME is a subdirectory of IMPALA_TOOLCHAIN. Either
# Impala-lzo will need to be fixed or it will need to be removed.
if (DEFINED ENV{IMPALA_LZO} AND EXISTS $ENV{IMPALA_LZO})
add_custom_target(impala-lzo ALL DEPENDS gen-deps
COMMAND $ENV{IMPALA_LZO}/build.sh ${CMAKE_BUILD_TYPE} ${CMAKE_SOURCE_DIR}
$ENV{IMPALA_TOOLCHAIN_PACKAGES_HOME}
)
endif()
# Dump include paths to a file
if (DUMP_INCLUDE_PATHS)
file(REMOVE "${DUMP_INCLUDE_PATHS}")


@@ -42,9 +42,9 @@ using boost::upgrade_lock;
using boost::upgrade_to_unique_lock;
using std::find;
// Allow LZO by default to maintain backwards compatibility. We can add more options
// if we determine that the plugins are well-maintained and generally stable.
DEFINE_string(enabled_hdfs_text_scanner_plugins, "LZO", "(Advanced) whitelist of HDFS "
// LZO is no longer supported, so there are no plugins enabled by default. This is
// likely to be removed.
DEFINE_string(enabled_hdfs_text_scanner_plugins, "", "(Advanced) whitelist of HDFS "
"text scanner plugins that Impala will try to dynamically load. Must be a "
"comma-separated list of upper-case compression codec names. Each plugin implements "
"support for decompression and hands off the decompressed bytes to Impala's builtin "


@@ -824,14 +824,6 @@ ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char* file, int6
buffer_opts);
}
ScanRange* HdfsScanNodeBase::AllocateScanRange(hdfsFS fs, const char* file,
int64_t len, int64_t offset, int64_t partition_id, int disk_id,
int cache_options, bool expected_local, int64_t mtime,
bool is_erasure_coded, const ScanRange* original_split) {
return AllocateScanRange(fs, file, len, offset, partition_id, disk_id, expected_local,
is_erasure_coded, mtime, BufferOpts(cache_options), original_split);
}
void* HdfsScanNodeBase::GetCodegenFn(THdfsFileFormat::type type) {
auto it = codegend_fn_map_.find(type);
if (it == codegend_fn_map_.end()) return NULL;
@@ -1166,7 +1158,7 @@ void HdfsScanNodeBase::UpdateBytesRead(
}
}
HdfsFileDesc* ScanRangeSharedState::GetFileDesc(
const HdfsFileDesc* ScanRangeSharedState::GetFileDesc(
int64_t partition_id, const std::string& filename) {
auto file_desc_map_key = make_pair(partition_id, filename);
DCHECK(file_descs_.find(file_desc_map_key) != file_descs_.end());


@@ -130,9 +130,7 @@ class ScanRangeSharedState {
public:
/// Given a partition_id and filename returns the related file descriptor DCHECK ensures
/// there is always file descriptor returned.
/// TODO: The LZO scanner expects a non const object so switch to returning a const once
/// support for LZO scanner is removed.
HdfsFileDesc* GetFileDesc(int64_t partition_id, const std::string& filename);
const HdfsFileDesc* GetFileDesc(int64_t partition_id, const std::string& filename);
/// Sets the scanner specific metadata for 'partition_id' and 'filename'.
/// Scanners can use this to store file header information. Thread safe.
@@ -497,12 +495,6 @@ class HdfsScanNodeBase : public ScanNode {
ScanRangeMetadata* metadata, int disk_id, bool expected_local,
bool is_erasure_coded, int64_t mtime, const io::BufferOpts& buffer_opts);
/// Old API for compatibility with text scanners (e.g. LZO text scanner).
io::ScanRange* AllocateScanRange(hdfsFS fs, const char* file, int64_t len,
int64_t offset, int64_t partition_id, int disk_id, int cache_options,
bool expected_local, int64_t mtime, bool is_erasure_coded = false,
const io::ScanRange* original_split = nullptr);
/// Adds ranges to be read later by scanners. Must not be called once
/// remaining_scan_range_submissions_ is 0. The enqueue_location specifies whether the
/// scan ranges are added to the head or tail of the queue. Implemented by child classes
@@ -525,7 +517,7 @@ class HdfsScanNodeBase : public ScanNode {
/// Given a partition_id and filename returns the related file descriptor
/// DCHECK ensures there is always file descriptor returned
inline HdfsFileDesc* GetFileDesc(
inline const HdfsFileDesc* GetFileDesc(
int64_t partition_id, const std::string& filename) {
return shared_state_->GetFileDesc(partition_id, filename);
}
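To make concrete what "code changes to build against Impala" means for a revived plugin,
a hedged sketch (not standalone code: it needs the Impala scan-node headers, the helper
name is hypothetical, and the exact remaining AllocateScanRange() overloads should be
verified against this header). A plugin scanner now has to construct io::BufferOpts
itself, mirroring what the removed compatibility wrapper did, and must treat the file
descriptor returned by GetFileDesc() as const:

    // Hypothetical plugin-side replacement for the removed compatibility wrapper;
    // the forwarding call mirrors the removed wrapper's body shown above.
    io::ScanRange* AllocateLzoScanRange(HdfsScanNodeBase* node, hdfsFS fs,
        const char* file, int64_t len, int64_t offset, int64_t partition_id,
        int disk_id, int cache_options, bool expected_local, int64_t mtime,
        bool is_erasure_coded, const io::ScanRange* original_split) {
      return node->AllocateScanRange(fs, file, len, offset, partition_id, disk_id,
          expected_local, is_erasure_coded, mtime, io::BufferOpts(cache_options),
          original_split);
    }

    // GetFileDesc() now returns a const pointer, so plugin code must also switch:
    //   const HdfsFileDesc* desc = node->GetFileDesc(partition_id, filename);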


@@ -48,7 +48,7 @@ const char* const Codec::ZSTD_COMPRESSION =
const char* const Codec::UNKNOWN_CODEC_ERROR =
"This compression codec is currently unsupported: ";
const char* const NO_LZO_MSG = "LZO codecs may not be created via the Codec interface. "
"Instead the LZO library is directly invoked.";
"Instead LZO is decoded by an optional text scanner plugin.";
const Codec::CodecMap Codec::CODEC_MAP = {{"", THdfsCompression::NONE},
{DEFAULT_COMPRESSION, THdfsCompression::DEFAULT},


@@ -199,7 +199,7 @@ function apt-get {
echo ">>> Installing build tools"
ubuntu apt-get update
ubuntu apt-get --yes install ccache curl gawk g++ gcc libffi-dev liblzo2-dev \
ubuntu apt-get --yes install ccache curl gawk g++ gcc libffi-dev \
libkrb5-dev krb5-admin-server krb5-kdc krb5-user libsasl2-dev \
libsasl2-modules libsasl2-modules-gssapi-mit libssl-dev make ninja-build \
python-dev python-setuptools postgresql ssh wget vim-common psmisc \
@@ -240,7 +240,7 @@ redhat sudo yum install -y curl gawk gcc gcc-c++ git krb5-devel krb5-server \
krb5-workstation libevent-devel libffi-devel make openssl-devel cyrus-sasl \
cyrus-sasl-gssapi cyrus-sasl-devel cyrus-sasl-plain \
postgresql postgresql-server \
wget vim-common nscd cmake lzo-devel fuse-devel zlib-devel \
wget vim-common nscd cmake fuse-devel zlib-devel \
psmisc lsof openssh-server redhat-lsb java-1.8.0-openjdk-devel \
java-1.8.0-openjdk-src
@@ -453,25 +453,6 @@ eval "$SET_JAVA_HOME"
# Assert that we have a java available
test -f $JAVA_HOME/bin/java
# LZO is not needed to compile or run Impala, but it is needed for the data load
echo ">>> Checking out Impala-lzo"
: ${IMPALA_LZO_HOME:="${IMPALA_HOME}/../Impala-lzo"}
if ! [[ -d "$IMPALA_LZO_HOME" ]]
then
git clone --branch master https://github.com/cloudera/impala-lzo.git "$IMPALA_LZO_HOME"
fi
echo ">>> Checking out and building hadoop-lzo"
: ${HADOOP_LZO_HOME:="${IMPALA_HOME}/../hadoop-lzo"}
if ! [[ -d "$HADOOP_LZO_HOME" ]]
then
git clone https://github.com/cloudera/hadoop-lzo.git "$HADOOP_LZO_HOME"
fi
cd "$HADOOP_LZO_HOME"
time -p ant package
cd "$IMPALA_HOME"
# Try to prepopulate the m2 directory to save time
if ! bin/jenkins/populate_m2_directory.py ; then
echo "Failed to prepopulate the m2 directory. Continuing..."


@@ -67,12 +67,5 @@ popd
rm -f "${IMPALA_HOME}/llvm-ir/"impala*.ll
rm -f "${IMPALA_HOME}/be/generated-sources/impala-ir/"*
# Cleanup Impala-lzo
if [ -e "${IMPALA_LZO}" ]; then
pushd "${IMPALA_LZO}"
git rev-parse 2>/dev/null && git clean -fdx
popd
fi
# When switching to and from toolchain, make sure to remove all CMake generated files
"${IMPALA_HOME}/bin/clean-cmake.sh"


@@ -325,8 +325,6 @@ export DOWNLOAD_CDH_COMPONENTS=${DOWNLOAD_CDH_COMPONENTS-true}
export IS_OSX="$(if [[ "$OSTYPE" == "darwin"* ]]; then echo true; else echo false; fi)"
export HADOOP_LZO="${HADOOP_LZO-$IMPALA_HOME/../hadoop-lzo}"
export IMPALA_LZO="${IMPALA_LZO-$IMPALA_HOME/../Impala-lzo}"
export IMPALA_AUX_TEST_HOME="${IMPALA_AUX_TEST_HOME-$IMPALA_HOME/../Impala-auxiliary-tests}"
export TARGET_FILESYSTEM="${TARGET_FILESYSTEM-hdfs}"
export ERASURE_CODING="${ERASURE_CODING-false}"
@@ -568,18 +566,13 @@ export HADOOP_CONF_DIR="$IMPALA_FE_DIR/src/test/resources"
export HADOOP_INCLUDE_DIR=${HADOOP_INCLUDE_DIR_OVERRIDE:-"${HADOOP_HOME}/include"}
export HADOOP_LIB_DIR=${HADOOP_LIB_DIR_OVERRIDE:-"${HADOOP_HOME}/lib"}
# Please note that the * is inside quotes, thus it won't get expanded by bash but
# by java, see "Understanding class path wildcards" at http://goo.gl/f0cfft
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH-}:${HADOOP_HOME}/share/hadoop/tools/lib/*"
# YARN is configured to use LZO so the LZO jar needs to be in the hadoop classpath.
export LZO_JAR_PATH="$HADOOP_LZO/build/hadoop-lzo-0.4.15.jar"
HADOOP_CLASSPATH+=":$LZO_JAR_PATH"
# Beware of adding entries from $HADOOP_HOME here, because they can change
# the order of the classpath, leading to configuration not showing up first.
HADOOP_CLASSPATH="$LZO_JAR_PATH"
export HADOOP_CLASSPATH="${HADOOP_CLASSPATH-}"
# Add the path containing the hadoop-aws jar, which is required to access AWS from the
# minicluster.
# Please note that the * is inside quotes, thus it won't get expanded by bash but
# by java, see "Understanding class path wildcards" at http://goo.gl/f0cfft
HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/*"
export PATH="$HADOOP_HOME/bin:$PATH"
@@ -610,7 +603,7 @@ export HIVE_CONF_DIR="$IMPALA_FE_DIR/./src/test/resources"
export POSTGRES_JDBC_DRIVER="${IMPALA_FE_DIR}/target/dependency/postgresql-${IMPALA_POSTGRES_JDBC_DRIVER_VERSION}.jar"
export HIVE_AUX_JARS_PATH="$POSTGRES_JDBC_DRIVER"
export AUX_CLASSPATH="${LZO_JAR_PATH}"
export AUX_CLASSPATH=""
### Tell hive not to use jline
export HADOOP_USER_CLASSPATH_FIRST=true
@@ -707,7 +700,6 @@ LIBHDFS_OPTS="${LIBHDFS_OPTS} -XX:MaxPermSize=128m"
export CLASSPATH="$IMPALA_FE_DIR/target/dependency:${CLASSPATH:+:${CLASSPATH}}"
CLASSPATH="$IMPALA_FE_DIR/target/classes:$CLASSPATH"
CLASSPATH="$IMPALA_FE_DIR/src/test/resources:$CLASSPATH"
CLASSPATH="$LZO_JAR_PATH:$CLASSPATH"
# A marker in the environment to prove that we really did source this file
export IMPALA_CONFIG_SOURCED=1
@@ -726,8 +718,6 @@ echo "HBASE_CONF_DIR = $HBASE_CONF_DIR"
echo "RANGER_HOME = $RANGER_HOME"
echo "RANGER_CONF_DIR = $RANGER_CONF_DIR "
echo "THRIFT_HOME = $THRIFT_HOME"
echo "HADOOP_LZO = $HADOOP_LZO"
echo "IMPALA_LZO = $IMPALA_LZO"
echo "CLASSPATH = $CLASSPATH"
echo "LIBHDFS_OPTS = $LIBHDFS_OPTS"
echo "JAVA_HOME = $JAVA_HOME"


@@ -21,9 +21,6 @@
# run Impala binaries in the context of a dev environment.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}"
# Impala-lzo is loaded at runtime, so needs to be on the search path.
LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${IMPALA_LZO}/build"
# We built against toolchain GCC so we need to dynamically link against matching
# library versions. (the rpath isn't baked into the binaries)
IMPALA_TOOLCHAIN_GCC_LIB=\


@@ -139,7 +139,6 @@ options, args = parser.parse_args()
IMPALA_HOME = os.environ["IMPALA_HOME"]
CORE_SITE_PATH = os.path.join(IMPALA_HOME, "fe/src/test/resources/core-site.xml")
KNOWN_BUILD_TYPES = ["debug", "release", "latest"]
IMPALA_LZO = os.environ["IMPALA_LZO"]
# The location in the container where the cache is always mounted.
DATA_CACHE_CONTAINER_PATH = "/opt/impala/cache"
@@ -623,12 +622,6 @@ class DockerMiniClusterOperations(object):
# Run the container as the current user.
user_args = ["--user", "{0}:{1}".format(os.getuid(), os.getgid())]
# Allow loading LZO plugin, if built.
lzo_lib_dir = os.path.join(IMPALA_LZO, "build")
if os.path.isdir(lzo_lib_dir):
mount_args += ["--mount",
"type=bind,src={0},dst=/opt/impala/lib/plugins".format(lzo_lib_dir)]
mem_limit_args = []
if mem_limit is not None:
mem_limit_args = ["--memory", str(mem_limit)]


@@ -434,9 +434,6 @@ build_all_components() {
if (( build_independent_targets )); then
MAKE_TARGETS+=" cscope fe tarballs"
fi
if [[ -e "$IMPALA_LZO" ]]; then
MAKE_TARGETS+=" impala-lzo"
fi
fi
${MAKE_CMD} -j${IMPALA_BUILD_THREADS:-4} ${IMPALA_MAKE_FLAGS} ${MAKE_TARGETS}
}
@@ -518,13 +515,6 @@ reconfigure_test_cluster() {
# Generate the Hadoop configs needed by Impala
"${IMPALA_HOME}/bin/create-test-configuration.sh" ${CREATE_TEST_CONFIG_ARGS}
# Copy Hadoop-lzo dependencies if available (required to generate Lzo data).
if stat "$HADOOP_LZO"/build/native/Linux-*-*/lib/libgplcompression.* > /dev/null ; then
cp "$HADOOP_LZO"/build/native/Linux-*-*/lib/libgplcompression.* "$HADOOP_LIB_DIR/native"
else
echo "No hadoop-lzo found"
fi
}
# Starts the test cluster processes except for Impala.


@@ -250,14 +250,6 @@ function build_impdev() {
git fetch /git_common_dir --no-tags "$GIT_HEAD_REV"
git checkout -b test-with-docker FETCH_HEAD
# Checkout impala-lzo too
mkdir /home/impdev/Impala-lzo
pushd /home/impdev/Impala-lzo
git init
git fetch $IMPALA_LZO_REPO --no-tags "$IMPALA_LZO_REF"
git checkout -b test-with-docker FETCH_HEAD
popd
# Link in logs. Logs are on the host since that's the most important thing to
# look at after the tests are run.
ln -sf /logs logs


@@ -18,13 +18,11 @@ ARG BASE_IMAGE=ubuntu:16.04
FROM ${BASE_IMAGE}
# Install minimal dependencies required for Impala services to run.
# liblzo2-2 may be needed by the Impala-lzo plugin, which is used in tests.
# We install it in the base image for convenience.
RUN apt-get update && \
apt-get install -y openjdk-8-jre-headless \
libsasl2-2 libsasl2-modules libsasl2-modules-gssapi-mit \
sudo netcat-openbsd less curl iproute2 vim iputils-ping \
tzdata liblzo2-2 krb5-user && \
tzdata krb5-user && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*


@@ -184,11 +184,6 @@ def main():
default=os.path.expanduser("~/.ccache"))
parser.add_argument('--tail', action="store_true",
help="Run tail on all container log files.")
parser.add_argument('--impala-lzo-repo',
default="https://github.com/cloudera/impala-lzo.git",
help="Git repo for Impala-lzo repo")
parser.add_argument('--impala-lzo-ref', default='master',
help="Branch name for Impala-lzo repo.")
parser.add_argument('--env', metavar='K=V', default=[], action='append',
help="""Passes given environment variables (expressed as KEY=VALUE)
through containers.
@@ -210,8 +205,6 @@ def main():
suite_concurrency=args.suite_concurrency,
impalad_mem_limit_bytes=args.impalad_mem_limit_bytes,
tail=args.tail,
impala_lzo_repo=args.impala_lzo_repo,
impala_lzo_ref=args.impala_lzo_ref,
env=args.env, base_image=args.base_image)
fh = logging.FileHandler(os.path.join(_make_dir_if_not_exist(t.log_dir), "log.txt"))
@@ -449,7 +442,7 @@ class TestWithDocker(object):
cleanup_image, ccache_dir, test_mode,
suite_concurrency, parallel_test_concurrency,
impalad_mem_limit_bytes, tail,
impala_lzo_repo, impala_lzo_ref, env, base_image):
env, base_image):
self.build_image = build_image
self.name = name
self.containers = []
@@ -485,8 +478,6 @@ class TestWithDocker(object):
self.parallel_test_concurrency = parallel_test_concurrency
self.impalad_mem_limit_bytes = impalad_mem_limit_bytes
self.tail = tail
self.impala_lzo_repo = impala_lzo_repo
self.impala_lzo_ref = impala_lzo_ref
self.env = env
self.base_image = base_image
@@ -571,8 +562,6 @@ class TestWithDocker(object):
"-v", self.git_root + ":/repo:ro",
"-v", self.git_common_dir + ":/git_common_dir:ro",
"-e", "GIT_HEAD_REV=" + self.git_head_rev,
"-e", "IMPALA_LZO_REPO=" + self.impala_lzo_repo,
"-e", "IMPALA_LZO_REF=" + self.impala_lzo_ref,
# Share timezone between host and container
"-e", "LOCALTIME_LINK_TARGET=" + localtime_link_target,
"-v", self.ccache_dir + ":/ccache",


@@ -468,8 +468,7 @@ public class ToSqlUtils {
}
if (storageHandlerClass == null) {
// TODO: Remove this special case when we have the LZO_TEXT writer
// We must handle LZO_TEXT specially because Impala does not yet support creating
// We must handle LZO_TEXT specially because Impala does not support creating
// tables with this row format. In this case, we cannot output "WITH
// SERDEPROPERTIES" because Hive does not support it with "STORED AS". For any
// other HdfsFileFormat we want to output the serdeproperties because it is


@@ -27,10 +27,12 @@ import com.google.common.collect.ImmutableMap;
* Support for recognizing compression suffixes on data files.
* Compression of a file is recognized in mapreduce by looking for suffixes of
* supported codecs.
* For now Impala supports LZO, GZIP, SNAPPY, BZIP2 and some additional formats if plugins
* For now Impala supports GZIP, SNAPPY, BZIP2 and some additional formats if plugins
* are available. Even if a plugin is available, we need to add the file suffixes here so
* that we can resolve the compression type from the file name. LZO can use the specific
* HIVE input class.
* Some compression types here are detected even though they are not supported. This
* allows for better error messages (e.g. LZ4, LZO).
*/
public enum HdfsCompression {
NONE,


@@ -49,6 +49,9 @@ public enum HdfsFileFormat {
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
false, false, true),
// LZO_TEXT is never used as an actual HdfsFileFormat. It is used only to store the
// input format class and match against it (e.g. in HdfsCompression). Outside of this
// file, tables that use the LZO input format class use HdfsFileFormat.TEXT.
LZO_TEXT("com.hadoop.mapred.DeprecatedLzoTextInputFormat",
"org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"", false, false, true),
@@ -194,8 +197,7 @@ public enum HdfsFileFormat {
case TEXT:
if (compressionType == HdfsCompression.LZO ||
compressionType == HdfsCompression.LZO_INDEX) {
// TODO: Update this when we can write LZO text.
// It is not currently possible to create a table with LZO compressed text files
// It is not possible to create a table with LZO compressed text files
// in Impala, but this is valid in Hive.
return String.format("INPUTFORMAT '%s' OUTPUTFORMAT '%s'",
LZO_TEXT.inputFormat(), LZO_TEXT.outputFormat());


@@ -179,7 +179,6 @@ public class HdfsScanNode extends ScanNode {
ImmutableSet.<HdfsFileFormat>builder()
.add(HdfsFileFormat.RC_FILE)
.add(HdfsFileFormat.TEXT)
.add(HdfsFileFormat.LZO_TEXT)
.add(HdfsFileFormat.SEQUENCE_FILE)
.add(HdfsFileFormat.AVRO)
.build();


@@ -62,8 +62,8 @@ public class HdfsTableSink extends TableSink {
protected final boolean inputIsClustered_;
private static final Set<HdfsFileFormat> SUPPORTED_FILE_FORMATS = ImmutableSet.of(
HdfsFileFormat.PARQUET, HdfsFileFormat.TEXT, HdfsFileFormat.LZO_TEXT,
HdfsFileFormat.RC_FILE, HdfsFileFormat.SEQUENCE_FILE, HdfsFileFormat.AVRO);
HdfsFileFormat.PARQUET, HdfsFileFormat.TEXT, HdfsFileFormat.RC_FILE,
HdfsFileFormat.SEQUENCE_FILE, HdfsFileFormat.AVRO);
// Stores the indices into the list of non-clustering columns of the target table that
// are stored in the 'sort.columns' table property. This is sent to the backend to
@@ -150,7 +150,7 @@ public class HdfsTableSink extends TableSink {
return 1024L * 1024L * 1024L;
}
// For all other supported formats (TEXT, LZO_TEXT, RC_FILE, SEQUENCE_FILE & AVRO)
// For all other supported formats (TEXT, RC_FILE, SEQUENCE_FILE & AVRO)
// 100KB is a very approximate estimate of the amount of data buffered.
return 100L * 1024L;
}


@@ -3385,15 +3385,15 @@ public class AnalyzeStmtsTest extends AnalyzerTest {
// File type / table type mismatch.
AnalyzesOk(String.format("load data inpath '%s' %s into table " +
"tpch.lineitem",
"/test-warehouse/alltypes_text_lzo/year=2009/month=4", overwrite));
"/test-warehouse/alltypes_text_gzip/year=2009/month=4", overwrite));
// When table type matches, analysis passes for partitioned and unpartitioned
// tables.
AnalyzesOk(String.format("load data inpath '%s' %s into table " +
"functional_text_lzo.alltypes partition(year=2009, month=4)",
"/test-warehouse/alltypes_text_lzo/year=2009/month=4", overwrite));
"functional_text_gzip.alltypes partition(year=2009, month=4)",
"/test-warehouse/alltypes_text_gzip/year=2009/month=4", overwrite));
AnalyzesOk(String.format("load data inpath '%s' %s into table " +
"functional_text_lzo.jointbl",
"/test-warehouse/alltypes_text_lzo/year=2009/month=4", overwrite));
"functional_text_gzip.jointbl",
"/test-warehouse/alltypes_text_gzip/year=2009/month=4", overwrite));
// Verify with a read-only table
AnalysisError(String.format("load data inpath '%s' into table " +


@@ -101,12 +101,10 @@ public class AnalyzerTest extends FrontendTestBase {
@Test
public void TestCompressedText() throws AnalysisException {
AnalyzesOk("SELECT count(*) FROM functional_text_lzo.tinyinttable");
// TODO: Disabling the text/{gzip,bzip,snap} analysis test until the corresponding
// databases are loaded.
// AnalyzesOk("SELECT count(*) FROM functional_text_gzip.tinyinttable");
// AnalyzesOk("SELECT count(*) FROM functional_text_snap.tinyinttable");
// AnalyzesOk("SELECT count(*) FROM functional_text_bzip.tinyinttable");
AnalyzesOk("SELECT count(*) FROM functional_text_bzip.tinyinttable");
AnalyzesOk("SELECT count(*) FROM functional_text_def.tinyinttable");
AnalyzesOk("SELECT count(*) FROM functional_text_gzip.tinyinttable");
AnalyzesOk("SELECT count(*) FROM functional_text_snap.tinyinttable");
}
@Test


@@ -453,28 +453,6 @@ EOF
}
function load-custom-data {
# Load the index files for corrupted lzo data.
hadoop fs -mkdir -p /test-warehouse/bad_text_lzo_text_lzo
hadoop fs -rm -f /test-warehouse/bad_text_lzo_text_lzo/bad_text.lzo.index
hadoop fs -put ${IMPALA_HOME}/testdata/bad_text_lzo/bad_text.lzo.index \
/test-warehouse/bad_text_lzo_text_lzo/
hadoop fs -rm -r -f /bad_text_lzo_text_lzo/
hadoop fs -mv /test-warehouse/bad_text_lzo_text_lzo/ /
# Cleanup the old bad_text_lzo files, if they exist.
hadoop fs -rm -r -f /test-warehouse/bad_text_lzo/
# TODO: Why is there a REMOTE_LOAD condition? See IMPALA-4347
if [[ -z $REMOTE_LOAD ]]; then
# Index all lzo files in HDFS under /test-warehouse
${IMPALA_HOME}/testdata/bin/lzo_indexer.sh /test-warehouse
fi
hadoop fs -mv /bad_text_lzo_text_lzo/ /test-warehouse/
# Remove all index files in this partition.
hadoop fs -rm -f /test-warehouse/alltypes_text_lzo/year=2009/month=1/*.lzo.index
# Add a sequence file that only contains a header (see IMPALA-362)
hadoop fs -put -f ${IMPALA_HOME}/testdata/tinytable_seq_snap/tinytable_seq_snap_header_only \
/test-warehouse/tinytable_seq_snap


@@ -171,7 +171,6 @@ COMPRESSION_MAP = {'def': 'org.apache.hadoop.io.compress.DefaultCodec',
'gzip': 'org.apache.hadoop.io.compress.GzipCodec',
'bzip': 'org.apache.hadoop.io.compress.BZip2Codec',
'snap': 'org.apache.hadoop.io.compress.SnappyCodec',
'lzo': 'com.hadoop.compression.lzo.LzopCodec',
'none': ''
}
@@ -188,9 +187,6 @@ FILE_FORMAT_MAP = {
'orc': 'ORC',
'parquet': 'PARQUET',
'hudiparquet': 'HUDIPARQUET',
'text_lzo':
"\nINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'" +
"\nOUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'",
'avro': 'AVRO',
'hbase': "'org.apache.hadoop.hive.hbase.HBaseStorageHandler'",
'kudu': "KUDU",
@@ -224,7 +220,7 @@ WITH SERDEPROPERTIES (
"{hbase_column_mapping}")
{tbl_properties}{{hdfs_location}}"""
KNOWN_EXPLORATION_STRATEGIES = ['core', 'pairwise', 'exhaustive', 'lzo']
KNOWN_EXPLORATION_STRATEGIES = ['core', 'pairwise', 'exhaustive']
def build_create_statement(table_template, table_name, db_name, db_suffix,
file_format, compression, hdfs_location,
@@ -232,8 +228,6 @@ def build_create_statement(table_template, table_name, db_name, db_suffix,
create_stmt = ''
if (force_reload):
create_stmt += 'DROP TABLE IF EXISTS %s%s.%s;\n' % (db_name, db_suffix, table_name)
if compression == 'lzo':
file_format = '%s_%s' % (file_format, compression)
# hbase / kudu tables are external, and not read from hdfs. We don't need an
# hdfs_location.
if file_format in ['hbase', 'kudu']:
@@ -454,9 +448,9 @@ def build_insert_into_statement(insert, db_name, db_suffix, table_name, file_for
statement += "set hive.auto.convert.join=true;\n"
# For some reason (hive bug?) we need to have the CombineHiveInputFormat set
# for cases where we are compressing in bzip or lzo on certain tables that
# for cases where we are compressing in bzip on certain tables that
# have multiple files.
if 'multi' in table_name and ('bzip' in db_suffix or 'lzo' in db_suffix):
if 'multi' in table_name and ('bzip' in db_suffix):
statement += SET_HIVE_INPUT_FORMAT % "CombineHiveInputFormat"
else:
statement += SET_HIVE_INPUT_FORMAT % "HiveInputFormat"
@@ -682,9 +676,6 @@ def generate_statements(output_name, test_vectors, sections,
output = impala_create
if create_hive or file_format == 'hbase':
output = hive_output
elif codec == 'lzo':
# Impala CREATE TABLE doesn't allow INPUTFORMAT.
output = hive_output
# TODO: Currently, Kudu does not support partitioned tables via Impala.
# If a CREATE_KUDU section was provided, assume it handles the partition columns
@@ -748,21 +739,7 @@ def generate_statements(output_name, test_vectors, sections,
# moment, it assumes we're only using ALTER for partitioning the table.
if alter and file_format not in ("hbase", "kudu"):
use_db = 'USE {db_name};\n'.format(db_name=db)
if output == hive_output and codec == 'lzo':
# Hive ALTER TABLE ADD PARTITION doesn't handle null partitions, so
# we can't run the ALTER section in this case.
if options.force_reload:
# IMPALA-2278: Hive INSERT OVERWRITE won't clear out partition directories
# that weren't already added to the table. So, for force reload, manually
# delete the partition directories.
output.create.append(("DFS -rm -R {data_path};").format(
data_path=data_path))
else:
# If this is not a force reload use msck repair to add the partitions
# into the table.
output.create.append(use_db + 'msck repair table %s;' % (table_name))
else:
output.create.append(use_db + alter.format(table_name=table_name))
output.create.append(use_db + alter.format(table_name=table_name))
# If the directory already exists in HDFS, assume that data files already exist
# and skip loading the data. Otherwise, the data is generated using either an


@@ -92,7 +92,6 @@ def is_valid_combination(vector):
if len(vector) == 4:
return not (
(vector[FILE_FORMAT_IDX] == 'text' and vector[COMPRESSION_IDX] in ['def']) or
(vector[FILE_FORMAT_IDX] != 'text' and vector[COMPRESSION_IDX] == 'lzo') or
(vector[COMPRESSION_IDX] == 'none' and vector[COMPRESSION_TYPE_IDX] != 'none') or
(vector[COMPRESSION_IDX] != 'none' and vector[COMPRESSION_TYPE_IDX] == 'none') or
(vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or


@@ -44,8 +44,7 @@ COMPRESSION_VALUES_MAP = {
"parquet": {
"none": "SNAPPY",
"snap": "SNAPPY",
"gzip": "GZIP",
"lzo": "LZO"
"gzip": "GZIP"
},
# Currently, only three codecs are supported in Hive for ORC. See Hive codes in
# org.apache.orc.impl.WriterImpl#createCodec (in module hive-orc)
@@ -397,7 +396,7 @@ if __name__ == "__main__":
source_db = args.source_db
target_db = args.target_db
file_format, compression_value = args.table_format.split("/")
# 'compression_value' is one of [none,def,gzip,bzip,snap,lzo]. We should translate it
# 'compression_value' is one of [none,def,gzip,bzip,snap]. We should translate it
# into values that can be set to Hive.
if file_format not in COMPRESSION_KEYS_MAP:
raise Exception("Nested types in file format %s are not supported" % file_format)


@@ -1,20 +0,0 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
hadoop jar ${HADOOP_LZO}/build/hadoop-lzo-0.4.15.jar com.hadoop.compression.lzo.DistributedLzoIndexer $*


@@ -26,8 +26,6 @@ target_filesystem = os.environ.get('TARGET_FILESYSTEM')
compression_codecs = [
'org.apache.hadoop.io.compress.GzipCodec',
'org.apache.hadoop.io.compress.DefaultCodec',
'com.hadoop.compression.lzo.LzoCodec',
'com.hadoop.compression.lzo.LzopCodec',
'org.apache.hadoop.io.compress.BZip2Codec'
]
@@ -44,7 +42,6 @@ CONFIG = {
# Compression codecs
'io.compression.codecs': ",".join(compression_codecs),
'io.compression.deoc.lzo.class': 'com.hadoop.compression.lzo.LzoCodec',
# Set up proxyuser
'hadoop.proxyuser.${USER}.hosts': '*',


@@ -76,9 +76,7 @@ app_classpath = [
'$HADOOP_HDFS_HOME/share/hadoop/hdfs/*',
'$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*',
'$HADOOP_YARN_HOME/share/hadoop/yarn/*',
'$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*',
# Append the LZO jar for LZO-compressed file support.
'${LZO_JAR_PATH}']
'$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*']
# Hive 3 needs Tez on the classpath.
if hive_major_version == 3:


@@ -1563,17 +1563,6 @@ OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
---- DATASET
functional
---- BASE_TABLE_NAME
bad_text_lzo
---- COLUMNS
field STRING
---- DEPENDENT_LOAD_HIVE
-- Error recovery test data for LZO compression.
LOAD DATA LOCAL INPATH '{impala_home}/testdata/bad_text_lzo/bad_text.lzo'
OVERWRITE INTO TABLE {db_name}{db_suffix}.{table_name};
====
---- DATASET
functional
---- BASE_TABLE_NAME
bad_text_gzip
---- COLUMNS
s STRING


@@ -33,7 +33,6 @@ table_name:insert_overwrite_partitioned, constraint:restrict_to, table_format:pa
table_name:insert_string_partitioned, constraint:restrict_to, table_format:parquet/none/none
table_name:old_rcfile_table, constraint:restrict_to, table_format:rc/none/none
table_name:bad_text_lzo, constraint:restrict_to, table_format:text/lzo/block
table_name:bad_text_gzip, constraint:restrict_to, table_format:text/gzip/block
table_name:bad_seq_snap, constraint:restrict_to, table_format:seq/snap/block
table_name:bad_avro_snap_strings, constraint:restrict_to, table_format:avro/snap/block
@@ -242,13 +241,11 @@ table_name:date_tbl, constraint:restrict_to, table_format:avro/snap/block
table_name:date_tbl, constraint:restrict_to, table_format:orc/def/block
table_name:date_tbl, constraint:restrict_to, table_format:hbase/none/none
table_name:date_tbl, constraint:restrict_to, table_format:text/none/none
table_name:date_tbl, constraint:restrict_to, table_format:text/lzo/block
table_name:date_tbl, constraint:restrict_to, table_format:text/bzip/block
table_name:date_tbl, constraint:restrict_to, table_format:text/gzip/block
table_name:date_tbl, constraint:restrict_to, table_format:text/snap/block
table_name:date_tbl, constraint:restrict_to, table_format:text/def/block
table_name:date_tbl_error, constraint:restrict_to, table_format:text/none/none
table_name:date_tbl_error, constraint:restrict_to, table_format:text/lzo/block
table_name:date_tbl_error, constraint:restrict_to, table_format:text/bzip/block
table_name:date_tbl_error, constraint:restrict_to, table_format:text/gzip/block
table_name:date_tbl_error, constraint:restrict_to, table_format:text/snap/block
@@ -280,7 +277,6 @@ table_name:bucketed_ext_table, constraint:exclude, table_format:hbase/none/none
table_name:bucketed_ext_table, constraint:exclude, table_format:kudu/none/none
table_name:bucketed_table, constraint:exclude, table_format:hbase/none/none
table_name:bucketed_table, constraint:exclude, table_format:kudu/none/none
table_name:bucketed_table, constraint:exclude, table_format:text/lzo/block
# The uncompressed ORC tables are mainly used in test_scanners_fuzz.py to avoid creating
# them each time when running the test. Developers may run this test many times locally.

@@ -531,8 +531,8 @@ PLAN-ROOT SINK
# join involving tables with no table stats
# one of the tables (alltypes) is a compressed text file
# tests that the default join strategy is broadcast
select * from functional_text_lzo.emptytable a inner join
functional_text_lzo.alltypes b on a.f2 = b.int_col
select * from functional_text_gzip.emptytable a inner join
functional_text_gzip.alltypes b on a.f2 = b.int_col
---- PLAN
PLAN-ROOT SINK
|
@@ -541,11 +541,11 @@ PLAN-ROOT SINK
| runtime filters: RF000 <- a.f2
| row-size=96B cardinality=5.65K
|
|--00:SCAN HDFS [functional_text_lzo.emptytable a]
|--00:SCAN HDFS [functional_text_gzip.emptytable a]
| partitions=0/0 files=0 size=0B
| row-size=16B cardinality=0
|
01:SCAN HDFS [functional_text_lzo.alltypes b]
01:SCAN HDFS [functional_text_gzip.alltypes b]
HDFS partitions=24/24 files=24 size=123.32KB
runtime filters: RF000 -> b.int_col
row-size=80B cardinality=5.65K


@@ -3056,8 +3056,8 @@ PLAN-ROOT SINK
# join involving tables with no table stats
# one of the tables (alltypes) is a compressed text file
# tests that the default join strategy is broadcast
select * from functional_text_lzo.emptytable a inner join
functional_text_lzo.alltypes b on a.f2 = b.int_col
select * from functional_text_gzip.emptytable a inner join
functional_text_gzip.alltypes b on a.f2 = b.int_col
---- PLAN
PLAN-ROOT SINK
|
@@ -3066,11 +3066,11 @@ PLAN-ROOT SINK
| runtime filters: RF000 <- b.int_col
| row-size=96B cardinality=0
|
|--01:SCAN HDFS [functional_text_lzo.alltypes b]
|--01:SCAN HDFS [functional_text_gzip.alltypes b]
| HDFS partitions=24/24 files=24 size=123.32KB
| row-size=80B cardinality=unavailable
|
00:SCAN HDFS [functional_text_lzo.emptytable a]
00:SCAN HDFS [functional_text_gzip.emptytable a]
partitions=0/0 files=0 size=0B
runtime filters: RF000 -> a.f2
row-size=16B cardinality=0


@@ -1,4 +1,4 @@
file_format: text,seq,rc,avro,parquet,orc,hbase,kudu
dataset: functional
compression_codec: none,def,gzip,bzip,snap,lzo
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

@@ -4,7 +4,6 @@ file_format: text, dataset: functional, compression_codec: def, compression_type
file_format: text, dataset: functional, compression_codec: gzip, compression_type: block
file_format: text, dataset: functional, compression_codec: bzip, compression_type: block
file_format: text, dataset: functional, compression_codec: snap, compression_type: block
file_format: text, dataset: functional, compression_codec: lzo, compression_type: block
file_format: seq, dataset: functional, compression_codec: none, compression_type: none
file_format: seq, dataset: functional, compression_codec: def, compression_type: block
file_format: seq, dataset: functional, compression_codec: def, compression_type: record

@@ -107,24 +107,6 @@ row_regex: .*Error parsing row: file: $NAMENODE/.* before offset: \d+
INT, DATE, DATE
====
---- QUERY
select count(*) from functional_text_lzo.bad_text_lzo
---- ERRORS
Blocksize: 536870911 is greater than LZO_MAX_BLOCK_SIZE: 67108864
---- RESULTS
5141
---- TYPES
bigint
====
---- QUERY
select count(field) from functional_text_lzo.bad_text_lzo
---- ERRORS
Blocksize: 536870911 is greater than LZO_MAX_BLOCK_SIZE: 67108864
---- RESULTS
5141
---- TYPES
bigint
====
---- QUERY
select * from alltypeserrornonulls
---- ERRORS
Error converting column: 3 to SMALLINT


@@ -1,7 +0,0 @@
====
---- QUERY
# Test that running with plugin disabled fails gracefully.
select * from functional_text_lzo.alltypes
---- CATCH
Scanner plugin 'LZO' is not one of the enabled plugins: ''
====


@@ -371,18 +371,6 @@ LOCATION '$$location_uri$$'
TBLPROPERTIES ('external.table.purge'='TRUE')
====
---- QUERY
SHOW CREATE TABLE functional_text_lzo.tinytable
---- RESULTS-HIVE
CREATE EXTERNAL TABLE functional_text_lzo.tinytable (
a STRING,
b STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '$$location_uri$$'
====
---- QUERY
SHOW CREATE TABLE functional.allcomplextypes
---- RESULTS-HIVE
CREATE EXTERNAL TABLE functional.allcomplextypes (


@@ -13,7 +13,7 @@ BIGINT
select count(*)
from multi_text_compression where month <= 3
---- CATCH
Scanner plugin 'LZ4' is not one of the enabled plugins: 'LZO'
Scanner plugin 'LZ4' is not one of the enabled plugins: ''
====
---- QUERY
# Unknown compression suffix is treated as uncompressed text.
@@ -26,3 +26,10 @@ INT
Error converting column: 0 to INT
Error parsing row: file: __HDFS_FILENAME__, before offset: 16
====
---- QUERY
# Test that querying partition with unsupported plugin fails gracefully.
select count(*)
from multi_text_compression where month = 5
---- CATCH
Scanner plugin 'LZO' is not one of the enabled plugins: ''
====


@@ -1,4 +1,4 @@
file_format: text,seq
dataset: tpch
compression_codec: none,def,gzip,bzip,snap,lzo
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

@@ -1,6 +1,5 @@
# Generated File.
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block
file_format: seq, dataset: tpch, compression_codec: none, compression_type: none
file_format: seq, dataset: tpch, compression_codec: def, compression_type: block
file_format: seq, dataset: tpch, compression_codec: def, compression_type: record

@@ -2,4 +2,3 @@
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: seq, dataset: tpch, compression_codec: def, compression_type: block
file_format: seq, dataset: tpch, compression_codec: gzip, compression_type: record
file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block

@@ -1,4 +1,4 @@
file_format: text,seq,rc,avro,parquet,kudu
dataset: tpch
compression_codec: none,def,gzip,bzip,snap,lzo
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

@@ -1,6 +1,5 @@
# Generated File.
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block
file_format: seq, dataset: tpch, compression_codec: none, compression_type: none
file_format: seq, dataset: tpch, compression_codec: def, compression_type: block
file_format: seq, dataset: tpch, compression_codec: def, compression_type: record

@@ -2,4 +2,3 @@
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: seq, dataset: tpch, compression_codec: def, compression_type: block
file_format: seq, dataset: tpch, compression_codec: gzip, compression_type: record
file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block

@@ -1,4 +1,4 @@
file_format: text,seq, parquet
dataset: tpch
compression_codec: none,def,gzip,bzip,snap,lzo
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

@@ -1,6 +1,5 @@
# Generated File.
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block
file_format: seq, dataset: tpch, compression_codec: none, compression_type: none
file_format: seq, dataset: tpch, compression_codec: def, compression_type: block
file_format: seq, dataset: tpch, compression_codec: def, compression_type: record

@@ -2,4 +2,3 @@
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: seq, dataset: tpch, compression_codec: def, compression_type: block
file_format: seq, dataset: tpch, compression_codec: gzip, compression_type: record
file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block

@@ -1,4 +1,4 @@
file_format: text,seq,rc,avro,parquet
dataset: tpcds
compression_codec: none,def,gzip,bzip,snap,lzo
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

@@ -1,6 +1,5 @@
# Generated File.
file_format: text, dataset: tpcds, compression_codec: none, compression_type: none
file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block
file_format: seq, dataset: tpcds, compression_codec: none, compression_type: none
file_format: seq, dataset: tpcds, compression_codec: def, compression_type: block
file_format: seq, dataset: tpcds, compression_codec: def, compression_type: record

@@ -8,7 +8,6 @@ file_format: parquet, dataset: tpcds, compression_codec: def, compression_type:
file_format: avro, dataset: tpcds, compression_codec: def, compression_type: block
file_format: rc, dataset: tpcds, compression_codec: bzip, compression_type: block
file_format: seq, dataset: tpcds, compression_codec: snap, compression_type: record
file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block
file_format: rc, dataset: tpcds, compression_codec: def, compression_type: block
file_format: avro, dataset: tpcds, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none

@@ -1,4 +1,4 @@
file_format: text,seq,rc,avro,parquet,orc
dataset: tpcds
compression_codec: none,def,gzip,bzip,snap,lzo
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

@@ -1,6 +1,5 @@
# Generated File.
file_format: text, dataset: tpcds, compression_codec: none, compression_type: none
file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block
file_format: seq, dataset: tpcds, compression_codec: none, compression_type: none
file_format: seq, dataset: tpcds, compression_codec: def, compression_type: block
file_format: seq, dataset: tpcds, compression_codec: def, compression_type: record

@@ -8,7 +8,6 @@ file_format: parquet, dataset: tpcds, compression_codec: def, compression_type:
file_format: avro, dataset: tpcds, compression_codec: def, compression_type: block
file_format: rc, dataset: tpcds, compression_codec: bzip, compression_type: block
file_format: seq, dataset: tpcds, compression_codec: snap, compression_type: record
file_format: text, dataset: tpcds, compression_codec: lzo, compression_type: block
file_format: rc, dataset: tpcds, compression_codec: def, compression_type: block
file_format: avro, dataset: tpcds, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpcds, compression_codec: none, compression_type: none

@@ -1,4 +1,4 @@
file_format: text,seq,rc,avro,parquet,orc,kudu
dataset: tpch
compression_codec: none,def,gzip,bzip,snap,lzo
compression_codec: none,def,gzip,bzip,snap
compression_type: none,block,record

@@ -1,7 +1,6 @@
# Generated File.
file_format: text, dataset: tpch, compression_codec: none, compression_type: none
file_format: text, dataset: tpch, compression_codec: gzip, compression_type: block
file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block
file_format: seq, dataset: tpch, compression_codec: none, compression_type: none
file_format: seq, dataset: tpch, compression_codec: def, compression_type: block
file_format: seq, dataset: tpch, compression_codec: def, compression_type: record

@@ -8,7 +8,6 @@ file_format: parquet, dataset: tpch, compression_codec: def, compression_type: b
file_format: avro, dataset: tpch, compression_codec: def, compression_type: block
file_format: rc, dataset: tpch, compression_codec: bzip, compression_type: block
file_format: seq, dataset: tpch, compression_codec: snap, compression_type: record
file_format: text, dataset: tpch, compression_codec: lzo, compression_type: block
file_format: rc, dataset: tpch, compression_codec: def, compression_type: block
file_format: avro, dataset: tpch, compression_codec: none, compression_type: none
file_format: parquet, dataset: tpch, compression_codec: none, compression_type: none

@@ -32,7 +32,7 @@ class TableFormatInfo(object):
KNOWN_FILE_FORMATS = ['text', 'seq', 'rc', 'parquet', 'orc', 'avro', 'hbase']
if os.environ['KUDU_IS_SUPPORTED'] == 'true':
KNOWN_FILE_FORMATS.append('kudu')
KNOWN_COMPRESSION_CODECS = ['none', 'snap', 'gzip', 'bzip', 'def', 'lzo', 'zstd', 'lz4']
KNOWN_COMPRESSION_CODECS = ['none', 'snap', 'gzip', 'bzip', 'def', 'zstd', 'lz4']
KNOWN_COMPRESSION_TYPES = ['none', 'block', 'record']
def __init__(self, **kwargs):


@@ -26,7 +26,7 @@ from tests.common.test_dimensions import create_exec_option_dimension
from tests.common.test_result_verifier import verify_query_result_is_equal
# compression codecs impala support reading in text file type
TEXT_CODECS = ['snappy', 'gzip', 'zstd', 'lzo', 'bzip2', 'deflate', 'default']
TEXT_CODECS = ['snappy', 'gzip', 'zstd', 'bzip2', 'deflate', 'default']
class TestTextInterop(CustomClusterTestSuite):
@@ -84,7 +84,6 @@ class TestTextInterop(CustomClusterTestSuite):
'snappy': 'org.apache.hadoop.io.compress.SnappyCodec',
'gzip': 'org.apache.hadoop.io.compress.GzipCodec',
'zstd': 'org.apache.hadoop.io.compress.ZStandardCodec',
'lzo': 'com.hadoop.compression.lzo.LzopCodec',
'bzip2': 'org.apache.hadoop.io.compress.BZip2Codec',
'deflate': 'org.apache.hadoop.io.compress.DeflateCodec',
'default': 'org.apache.hadoop.io.compress.DefaultCodec'


@@ -1,34 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import pytest
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
class TestScannerPlugin(CustomClusterTestSuite):
"""Tests that involve changing the scanner plugin option."""
@classmethod
def get_workload(self):
return 'functional-query'
@pytest.mark.execute_serially
@CustomClusterTestSuite.with_args("--enabled_hdfs_text_scanner_plugins=")
def test_disable_lzo_plugin(self, vector):
"""Test that we can gracefully handle a disabled plugin."""
# Should be able to query valid partitions only.
self.run_test_case('QueryTest/disable-lzo-plugin', vector)


@@ -102,7 +102,7 @@ class TestMetadataQueryStatements(ImpalaTestSuite):
self.exec_and_compare_hive_and_impala_hs2("describe formatted functional.alltypes",
compare=compare_describe_formatted)
self.exec_and_compare_hive_and_impala_hs2(
"describe formatted functional_text_lzo.alltypes",
"describe formatted functional_text_gzip.alltypes",
compare=compare_describe_formatted)
# Describe an unpartitioned table.


@@ -181,7 +181,7 @@ class TestPartitionMetadataUncompressedTextOnly(ImpalaTestSuite):
FQ_TBL_NAME, TBL_LOCATION))
self.__add_alltypes_partition(vector, FQ_TBL_NAME, "functional", 2009, 1)
self.__add_alltypes_partition(vector, FQ_TBL_NAME, "functional_text_lzo", 2009, 2)
self.__add_alltypes_partition(vector, FQ_TBL_NAME, "functional_text_gzip", 2009, 2)
# Create a new partition with a bogus file with the unsupported LZ4 suffix.
lz4_year = 2009
@@ -204,8 +204,18 @@ class TestPartitionMetadataUncompressedTextOnly(ImpalaTestSuite):
"alter table {0} add partition (year={1}, month={2}) location '{3}'".format(
FQ_TBL_NAME, fake_comp_year, fake_comp_month, fake_comp_ym_partition_loc))
# Create a new partition with a bogus file with the now-unsupported LZO suffix
lzo_year = 2009
lzo_month = 5
lzo_ym_partition_loc = self.__make_ym_partition_dir(TBL_LOCATION, lzo_year, lzo_month)
self.filesystem_client.create_file("{0}/fake.lzo".format(lzo_ym_partition_loc)[1:],
"some test data")
self.client.execute(
"alter table {0} add partition (year={1}, month={2}) location '{3}'".format(
FQ_TBL_NAME, lzo_year, lzo_month, lzo_ym_partition_loc))
show_files_result = self.client.execute("show files in {0}".format(FQ_TBL_NAME))
assert len(show_files_result.data) == 4, "Expected one file per partition dir"
assert len(show_files_result.data) == 5, "Expected one file per partition dir"
self.run_test_case('QueryTest/unsupported-compression-partitions', vector,
unique_database)
@@ -222,8 +232,11 @@ class TestPartitionMetadataUncompressedTextOnly(ImpalaTestSuite):
"""Create the year/month partition directory and return the path."""
y_partition_loc = "{0}/year={1}".format(tbl_location, year)
ym_partition_loc = "{0}/month={1}".format(y_partition_loc, month)
self.filesystem_client.delete_file_dir(tbl_location[1:], recursive=True)
self.filesystem_client.make_dir(tbl_location[1:])
self.filesystem_client.make_dir(y_partition_loc[1:])
if not self.filesystem_client.exists(tbl_location[1:]):
self.filesystem_client.make_dir(tbl_location[1:])
if not self.filesystem_client.exists(y_partition_loc[1:]):
self.filesystem_client.make_dir(y_partition_loc[1:])
if self.filesystem_client.exists(ym_partition_loc[1:]):
self.filesystem_client.delete_file_dir(ym_partition_loc[1:], recursive=True)
self.filesystem_client.make_dir(ym_partition_loc[1:])
return ym_partition_loc


@@ -75,7 +75,6 @@ class TestCompressedFormats(ImpalaTestSuite):
file_format = vector.get_value('file_format')
extension, suffix = vector.get_value('compression_format')
if file_format in ['rc', 'seq']:
# TODO: How about LZO?
# Test that {gzip,snappy,bzip,deflate}-compressed
# {RC,sequence,text} files are supported.
db_suffix = '_%s_%s' % (file_format, suffix)


@@ -72,7 +72,7 @@ class TestScannersFuzzing(ImpalaTestSuite):
cls.ImpalaTestMatrix.add_constraint(lambda v:
v.get_value('table_format').file_format in ('avro', 'parquet', 'orc') or
(v.get_value('table_format').file_format == 'text' and
v.get_value('table_format').compression_codec in ('none', 'lzo')))
v.get_value('table_format').compression_codec in ('none')))
def test_fuzz_alltypes(self, vector, unique_database):
@@ -247,8 +247,7 @@ class TestScannersFuzzing(ImpalaTestSuite):
msg = "Should not throw error when abort_on_error=0: '{0}'".format(e)
LOG.error(msg)
# Parquet and compressed text can fail the query for some parse errors.
# E.g. corrupt Parquet footer (IMPALA-3773) or a corrupt LZO index file
# (IMPALA-4013).
# E.g. corrupt Parquet footer (IMPALA-3773)
table_format = vector.get_value('table_format')
if table_format.file_format not in ['parquet', 'orc', 'rc', 'seq'] \
and not (table_format.file_format == 'text' and