mirror of
https://github.com/apache/impala.git
synced 2025-12-23 21:08:39 -05:00
IMPALA-9249: Fix ORC scanner crash when root type is not struct
The root type should be struct as far as I know, and this was checked with a DCHECK, leading to crashes in fuzz tests. This change replaced the DCHECK with returning an error message. Testing: - added corrupt ORC file and e2e test Change-Id: I7fba8cffbcdf8f647e27e2d5ee9e6716a4492b9b Reviewed-on: http://gerrit.cloudera.org:8080/15021 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
0511b44f92
commit
d836246ab5
@@ -184,8 +184,8 @@ Status HdfsOrcScanner::Open(ScannerContext* context) {
|
||||
// Build 'col_id_path_map_' that maps from ORC column ids to their corresponding
|
||||
// SchemaPath in the table. The map is used in the constructors of OrcColumnReaders
|
||||
// where we resolve SchemaPaths of the descriptors.
|
||||
OrcMetadataUtils::BuildSchemaPaths(reader_->getType(),
|
||||
scan_node_->num_partition_keys(), &col_id_path_map_);
|
||||
RETURN_IF_ERROR(schema_resolver_->BuildSchemaPaths(scan_node_->num_partition_keys(),
|
||||
&col_id_path_map_));
|
||||
|
||||
// To create OrcColumnReaders, we need the selected orc schema. It's a subset of the
|
||||
// file schema: a tree of selected orc types and can only be got from an orc::RowReader
|
||||
|
||||
@@ -22,20 +22,24 @@
|
||||
|
||||
namespace impala {
|
||||
|
||||
void OrcMetadataUtils::BuildSchemaPaths(const orc::Type& root, int num_partition_keys,
|
||||
Status OrcSchemaResolver::BuildSchemaPaths(int num_partition_keys,
|
||||
vector<SchemaPath>* paths) {
|
||||
if (root_->getKind() != orc::TypeKind::STRUCT) {
|
||||
return Status(Substitute("Corrupt ORC File '$0': root type is $1 (should be struct)",
|
||||
filename_, root_->toString()));
|
||||
}
|
||||
SchemaPath path;
|
||||
paths->push_back(path);
|
||||
DCHECK_EQ(root.getKind(), orc::TypeKind::STRUCT);
|
||||
int num_columns = root.getSubtypeCount();
|
||||
int num_columns = root_->getSubtypeCount();
|
||||
for (int i = 0; i < num_columns; ++i) {
|
||||
path.push_back(i + num_partition_keys);
|
||||
BuildSchemaPath(*root.getSubtype(i), &path, paths);
|
||||
BuildSchemaPath(*root_->getSubtype(i), &path, paths);
|
||||
path.pop_back();
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void OrcMetadataUtils::BuildSchemaPath(const orc::Type& node, SchemaPath* path,
|
||||
void OrcSchemaResolver::BuildSchemaPath(const orc::Type& node, SchemaPath* path,
|
||||
vector<SchemaPath>* paths) {
|
||||
DCHECK_EQ(paths->size(), node.getColumnId());
|
||||
paths->push_back(*path);
|
||||
|
||||
@@ -24,17 +24,6 @@
|
||||
|
||||
namespace impala {
|
||||
|
||||
/// Utils to build a map from each orc::Type id to a SchemaPath. The map will be used in
|
||||
/// creating OrcColumnReaders.
|
||||
class OrcMetadataUtils {
|
||||
public:
|
||||
static void BuildSchemaPaths(const orc::Type& root, int num_partition_keys,
|
||||
std::vector<SchemaPath>* paths);
|
||||
private:
|
||||
static void BuildSchemaPath(const orc::Type& node, SchemaPath* path,
|
||||
std::vector<SchemaPath>* paths);
|
||||
};
|
||||
|
||||
/// Util class to resolve SchemaPaths of TupleDescriptors/SlotDescriptors into orc::Type.
|
||||
class OrcSchemaResolver {
|
||||
public:
|
||||
@@ -47,6 +36,11 @@ class OrcSchemaResolver {
|
||||
/// 'missing_field' is set to true if the column is missing in the ORC file.
|
||||
Status ResolveColumn(const SchemaPath& col_path, const orc::Type** node,
|
||||
bool* pos_field, bool* missing_field) const;
|
||||
|
||||
/// Build a map from each orc::Type id to a SchemaPath. The map will be used in
|
||||
/// creating OrcColumnReaders.
|
||||
Status BuildSchemaPaths(int num_partition_keys, std::vector<SchemaPath>* paths);
|
||||
|
||||
private:
|
||||
const HdfsTableDescriptor& tbl_desc_;
|
||||
const orc::Type* const root_;
|
||||
@@ -55,5 +49,8 @@ class OrcSchemaResolver {
|
||||
/// Validate whether the ColumnType is compatible with the orc type
|
||||
Status ValidateType(const ColumnType& type, const orc::Type& orc_type) const
|
||||
WARN_UNUSED_RESULT;
|
||||
|
||||
static void BuildSchemaPath(const orc::Type& node, SchemaPath* path,
|
||||
std::vector<SchemaPath>* paths);
|
||||
};
|
||||
}
|
||||
|
||||
4
testdata/data/README
vendored
4
testdata/data/README
vendored
@@ -458,3 +458,7 @@ Contains one row (1300-01-01 00:00:00) which is outside Impala's valid time rang
|
||||
|
||||
corrupt_schema.orc:
|
||||
ORC file from IMPALA-9277, generated by fuzz test. The file contains malformed metadata.
|
||||
|
||||
corrupt_root_type.orc:
|
||||
ORC file for IMPALA-9249, generated by fuzz test. The root type of the schema is not
|
||||
struct, which used to hit a DCHECK.
|
||||
|
||||
BIN
testdata/data/corrupt_root_type.orc
vendored
Normal file
BIN
testdata/data/corrupt_root_type.orc
vendored
Normal file
Binary file not shown.
@@ -1334,15 +1334,26 @@ class TestOrc(ImpalaTestSuite):
|
||||
self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
|
||||
new_vector, unique_database)
|
||||
|
||||
def test_invalid_schema(self, vector, unique_database):
|
||||
"""Test scanning of ORC file with malformed schema."""
|
||||
test_files = ["testdata/data/corrupt_schema.orc"]
|
||||
def _run_invalid_schema_test(self, unique_database, test_name, expected_error):
|
||||
"""Copies 'test_name'.orc to a table and runs a simple query. These tests should
|
||||
cause an error during the processing of the ORC schema, so the file's columns do
|
||||
not have to match with the table's columns.
|
||||
"""
|
||||
test_files = ["testdata/data/%s.orc" % test_name]
|
||||
create_table_and_copy_files(self.client,
|
||||
"CREATE TABLE {db}.{tbl} (id BIGINT) STORED AS ORC",
|
||||
unique_database, "corrupt_schema", test_files)
|
||||
unique_database, test_name, test_files)
|
||||
err = self.execute_query_expect_failure(self.client,
|
||||
"select count(*) from {0}.{1}".format(unique_database, "corrupt_schema"))
|
||||
assert "Encountered parse error during schema selection" in str(err)
|
||||
"select count(*) from {0}.{1}".format(unique_database, test_name))
|
||||
assert expected_error in str(err)
|
||||
|
||||
def test_invalid_schema(self, vector, unique_database):
|
||||
"""Test scanning of ORC file with malformed schema."""
|
||||
self._run_invalid_schema_test(unique_database, "corrupt_schema",
|
||||
"Encountered parse error during schema selection")
|
||||
self._run_invalid_schema_test(unique_database, "corrupt_root_type",
|
||||
"root type is boolean (should be struct)")
|
||||
|
||||
|
||||
class TestScannerReservation(ImpalaTestSuite):
|
||||
@classmethod
|
||||
|
||||
Reference in New Issue
Block a user