IMPALA-12903: Querying virtual column FILE__POSITION for TEXT and JSON tables crashes Impala

Impala crashes with a segmentation fault when a query references the
virtual column FILE__POSITION on TEXT or JSON tables. When a scanner that
does not support the FILE__POSITION virtual column detects its presence,
it tries to report an error and close itself. The segfault occurs in the
scanner's Close() method when it dereferences a NULL stream
object.

This patch simply adds NULL-checks in Close().

Alternatively, we could detect the presence of FILE__POSITION during
planning in the HdfsScanNode, but doing it in the scanners lets us
handle more queries, e.g. queries that dynamically prune partitions
such that all surviving partitions have file formats that support
FILE__POSITION.

Testing:
 * added negative tests that verify the errors are reported properly
 * added tests for mixed file format tables

Change-Id: I8e1af8d526f9046aceddb5944da9e6f9c63768b0
Reviewed-on: http://gerrit.cloudera.org:8080/21148
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Reviewed-by: Zoltan Borok-Nagy <boroknagyz@cloudera.com>
This commit is contained in:
Zoltan Borok-Nagy
2024-03-14 17:09:51 +01:00
parent c0507c02cd
commit 23c1f0d4e1
5 changed files with 94 additions and 3 deletions

View File

@@ -95,8 +95,10 @@ void HdfsJsonScanner::Close(RowBatch* row_batch) {
// Verify all resources (if any) have been transferred or freed.
DCHECK_EQ(template_tuple_pool_.get()->total_allocated_bytes(), 0);
DCHECK_EQ(data_buffer_pool_.get()->total_allocated_bytes(), 0);
scan_node_->RangeComplete(THdfsFileFormat::JSON,
stream_->file_desc()->file_compression);
if (stream_ != nullptr) {
scan_node_->RangeComplete(THdfsFileFormat::JSON,
stream_->file_desc()->file_compression);
}
CloseInternal();
}

View File

@@ -209,7 +209,7 @@ void HdfsTextScanner::Close(RowBatch* row_batch) {
DCHECK_EQ(template_tuple_pool_.get()->total_allocated_bytes(), 0);
DCHECK_EQ(data_buffer_pool_.get()->total_allocated_bytes(), 0);
DCHECK_EQ(boundary_pool_.get()->total_allocated_bytes(), 0);
if (!only_parsing_header_) {
if (!only_parsing_header_ && stream_ != nullptr) {
scan_node_->RangeComplete(THdfsFileFormat::TEXT,
stream_->file_desc()->file_compression);
}

View File

@@ -155,3 +155,34 @@ order by id;
---- TYPES
BIGINT, BIGINT, INT
====
---- QUERY
# Regression test for IMPALA-12903. The following query uses static pruning. The surviving
# partitions have file formats that support virtual column FILE__POSITION.
select file__position, year, month, id, date_string_col
from functional.alltypesmixedformat
where year=2009 and month = 4 and id < 905
order by id;
---- RESULTS
regex:\d+,2009,4,900,'04/01/09'
regex:\d+,2009,4,901,'04/01/09'
regex:\d+,2009,4,902,'04/01/09'
regex:\d+,2009,4,903,'04/01/09'
regex:\d+,2009,4,904,'04/01/09'
---- TYPES
BIGINT,INT,INT,INT,STRING
====
---- QUERY
# Regression test for IMPALA-12903. The following query uses dynamic pruning. The
# surviving partitions have file formats that support virtual column FILE__POSITION.
select straight_join lhs.file__position, lhs.year, lhs.month, lhs.id
from functional.alltypesmixedformat lhs, functional.alltypes rhs
where lhs.id = rhs.id and lhs.year = rhs.year and lhs.month = rhs.month and
rhs.id > 900 and rhs.id < 900 + rhs.month
order by id;
---- RESULTS
regex:\d+,2009,4,901
regex:\d+,2009,4,902
regex:\d+,2009,4,903
---- TYPES
BIGINT,INT,INT,INT
====

View File

@@ -0,0 +1,41 @@
====
---- QUERY
select file__position from functional.alltypes
---- CATCH
Virtual column FILE__POSITION is not supported for TEXT files.
====
---- QUERY
select file__position from functional_avro.alltypes
---- CATCH
Virtual column FILE__POSITION is not supported for AVRO files.
====
---- QUERY
select file__position from functional_json.alltypes
---- CATCH
Virtual column FILE__POSITION is not supported for JSON files.
====
---- QUERY
select file__position from functional_seq.alltypes
---- CATCH
Virtual column FILE__POSITION is not supported for SEQUENCE_FILE files.
====
---- QUERY
select file__position from functional_rc.alltypes
---- CATCH
Virtual column FILE__POSITION is not supported for RC_FILE files.
====
---- QUERY
select file__position, year, month, id, date_string_col from functional.alltypesmixedformat where year=2009 and month = 3;
---- CATCH
Virtual column FILE__POSITION is not supported for RC_FILE files.
====
---- QUERY
select file__position, year, month, id, date_string_col from functional.alltypesmixedformat where year=2009 and month = 1;
---- CATCH
Virtual column FILE__POSITION is not supported for TEXT files.
====
---- QUERY
select file__position, year, month, id, date_string_col from functional.alltypesmixedformat;
---- CATCH
Virtual column FILE__POSITION is not supported
====

View File

@@ -175,6 +175,23 @@ class TestScannersVirtualColumns(ImpalaTestSuite):
self.run_test_case('QueryTest/mixing-virtual-columns', vector, unique_database)
class TestScannersVirtualColumnsNegative(ImpalaTestSuite):
  """Runs the test cases in QueryTest/virtual-column-file-position-negative."""

  @classmethod
  def get_workload(cls):
    return 'functional-query'

  @classmethod
  def add_test_dimensions(cls):
    super(TestScannersVirtualColumnsNegative, cls).add_test_dimensions()
    # The test queries name their databases explicitly, so there is no need
    # to run this test with multiple file formats.
    text_only = create_uncompressed_text_dimension(cls.get_workload())
    cls.ImpalaTestMatrix.add_dimension(text_only)

  def test_virtual_column_file_position_negative(self, vector):
    test_file = 'QueryTest/virtual-column-file-position-negative'
    self.run_test_case(test_file, vector)
class TestIcebergVirtualColumns(ImpalaTestSuite):
@classmethod